def computeClimatology(experiment, filetype, domain, periods=None, offset=0, griddef=None,
                       varlist=None, ldebug=False, loverwrite=False, lparallel=False,
                       pidstr='', logger=None):
    ''' Worker function to compute climatologies for given file parameters. '''
    # input type checks
    if not isinstance(experiment, Exp): raise TypeError
    if not isinstance(filetype, basestring): raise TypeError
    if not isinstance(domain, (np.integer, int)): raise TypeError
    if periods is not None and not (isinstance(periods, (tuple, list)) and isInt(periods)):
        raise TypeError
    if not isinstance(offset, (np.integer, int)): raise TypeError
    if not isinstance(loverwrite, (bool, np.bool)): raise TypeError
    if griddef is not None and not isinstance(griddef, GridDefinition): raise TypeError
    #if pidstr == '[proc01]': raise TypeError # to test error handling

    # load source
    dataset_name = experiment.name
    fileclass = fileclasses[filetype]  # used for target file name
    tsfile = fileclass.tsfile.format(domain, '')
    expfolder = experiment.avgfolder
    filepath = '{:s}/{:s}'.format(expfolder, tsfile)
    logger.info('\n\n{0:s} *** Processing Experiment {1:<15s} *** '.format(pidstr, "'{:s}'".format(dataset_name)) +
                '\n{0:s} *** {1:^37s} *** \n'.format(pidstr, "'{:s}'".format(tsfile)))

    # check file and read begin/end dates
    if not os.path.exists(filepath):
        #raise IOError("Source file '{:s}' does not exist!".format(filepath))
        # print message and skip
        skipmsg = "\n{:s} >>> File '{:s}' in dataset '{:s}' is missing --- skipping!".format(pidstr, tsfile, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.warning(skipmsg)
        # N.B.: this can cause a lot of error messages, when not all files are present
    else:  # if the monthly source file exists
        import netCDF4 as nc
        ncfile = nc.Dataset(filepath, mode='r')
        begintuple = ncfile.begin_date.split('-')
        endtuple = ncfile.end_date.split('-')
        ncfile.close()
        # N.B.: at this point we don't want to initialize a full GDAL-enabled dataset, since we
        #       don't even know if we need it, and it creates a lot of overhead

        # determine age of source file
        if not loverwrite:
            sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))

        # figure out start date
        filebegin = int(begintuple[0])  # first element is the year
        fileend = int(endtuple[0])  # first element is the year
        begindate = offset + filebegin
        if not (filebegin <= begindate <= fileend): raise DateError
        # handle cases where the first month in the record is not January
        firstmonth = int(begintuple[1])  # second element is the month
        shift = firstmonth - 1  # will be zero for January (01)

        ## loop over periods
        if periods is None: periods = [begindate - fileend]
        #periods.sort(reverse=True)  # reverse, so that the largest chunk is done first
        source = None  # will later be assigned to the source dataset
        for period in periods:
            # figure out period
            enddate = begindate + period
            if filebegin > enddate:
                raise DateError('End date earlier than begin date.')
            if enddate - 1 > fileend:
                # N.B.: if filebegin is 1979 and the simulation is 10 years, fileend will be 1988, not 1989!
                # if the end date is not available, skip this period
                endmsg = "\n{:s} --- Invalid Period for '{:s}': End Date {:4d} not in File! --- \n".format(pidstr, dataset_name, enddate)
                endmsg += "{:s} --- ('{:s}')\n".format(pidstr, filepath)
                logger.info(endmsg)
            else:
                ## perform averaging for selected period
                # determine if sink file already exists, and what to do about it
                periodstr = '{0:4d}-{1:4d}'.format(begindate, enddate)
                gridstr = '' if griddef is None or griddef.name == 'WRF' else '_' + griddef.name
                filename = fileclass.climfile.format(domain, gridstr, '_' + periodstr)
                if ldebug: filename = 'test_' + filename
                if lparallel: tmppfx = 'tmp_wrfavg_{:s}_'.format(pidstr[1:-1])
                else: tmppfx = 'tmp_wrfavg_'
                tmpfilename = tmppfx + filename
                assert os.path.exists(expfolder)
                filepath = expfolder + filename
                tmpfilepath = expfolder + tmpfilename
                lskip = False  # else just go ahead
                if os.path.exists(filepath):
                    if not loverwrite:
                        age = datetime.fromtimestamp(os.path.getmtime(filepath))
                        # if sink file is newer than source file, skip (do not recompute)
                        if age > sourceage and os.path.getsize(filepath) > 1e6: lskip = True
                        # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
                        #print sourceage, age
                    if not lskip: os.remove(filepath)

                # depending on last modification time of file or overwrite setting, start computation, or skip
                if lskip:
                    # print message
                    skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
                    skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(skipmsg)
                else:
                    ## begin actual computation
                    beginmsg = "\n{:s} <<< Computing '{:s}' (d{:02d}) Climatology from {:s}".format(pidstr, dataset_name, domain, periodstr)
                    if griddef is None: beginmsg += " >>> \n"
                    else: beginmsg += " ('{:s}' grid) >>> \n".format(griddef.name)
                    logger.info(beginmsg)

                    ## actually load datasets
                    if source is None:
                        source = loadWRF_TS(experiment=experiment, filetypes=[filetype], domains=domain)  # comes out as a tuple...
                    if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

                    # prepare sink
                    if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
                    sink = DatasetNetCDF(name='WRF Climatology', folder=expfolder, filelist=[tmpfilename],
                                         atts=source.atts.copy(), mode='w')
                    sink.atts.period = periodstr

                    # initialize processing
                    lregrid = griddef is not None
                    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=lregrid, feedback=ldebug)  # no need for lat/lon

                    # start processing climatology
                    if shift != 0:
                        logger.info('{0:s} (shifting climatology by {1:d} month, to start with January) \n'.format(pidstr, shift))
                    CPU.Climatology(period=period, offset=offset, shift=shift, flush=False)
                    # N.B.: immediate flushing should not be necessary for climatologies, since they are much smaller!

                    # reproject and resample (regrid) dataset
                    if lregrid:
                        CPU.Regrid(griddef=griddef, flush=True)
                        logger.info('{:s} --- {:s} --- \n'.format(pidstr, str(griddef.geotransform)))

                    # sync temporary storage with output dataset (sink)
                    CPU.sync(flush=True)

                    # add Geopotential Height Variance
                    if 'GHT_Var' in sink and 'Z_var' not in sink:
                        data_array = (sink['GHT_Var'].data_array - sink['Z'].data_array**2)**0.5
                        atts = dict(name='Z_var', units='m', long_name='Square Root of Geopotential Height Variance')
                        sink += Variable(axes=sink['Z'].axes, data=data_array, atts=atts)

                    # add (relative) Vorticity Variance
                    if 'Vorticity_Var' in sink and 'zeta_var' not in sink:
                        data_array = (sink['Vorticity_Var'].data_array - sink['zeta'].data_array**2)**0.5
                        atts = dict(name='zeta_var', units='1/s', long_name='Square Root of Relative Vorticity Variance')
                        sink += Variable(axes=sink['zeta'].axes, data=data_array, atts=atts)

                    # add names and lengths of months
                    sink.axisAnnotation('name_of_month', name_of_month, 'time',
                                        atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
                    if not sink.hasVariable('length_of_month'):
                        sink += Variable(name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
                                         atts=dict(name='length_of_month', units='days', long_name='Length of Month'))

                    # close... and write results to file
                    sink.sync()
                    sink.close()
                    writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
                    writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(writemsg)
                    # rename file to proper name
                    if os.path.exists(filepath): os.remove(filepath)  # remove old file
                    os.rename(tmpfilepath, filepath)  # this will overwrite the old file

                    # print dataset
                    if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
                    # clean up (not sure if this is necessary, but there seems to be a memory leak...)
                    del sink, CPU
                    gc.collect()  # get rid of these guys immediately

        # clean up and return
        if source is not None:
            source.unload(); del source
        # N.B.: source is only loaded once for all periods
    # N.B.: garbage is collected in the multi-processing wrapper as well
    return 0  # so far, there is no measure of success other than the absence of a crash...
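# Usage sketch (illustrative, not part of the worker): how computeClimatology might be
# invoked serially. The 'WRF_exps' mapping and its key are hypothetical stand-ins for
# however Exp instances are looked up in this project; filetype and domain depend on
# the WRF setup.
import logging

logger = logging.getLogger('wrfavg.example')
logger.addHandler(logging.StreamHandler())
experiment = WRF_exps['max-ens']  # hypothetical lookup of an Exp instance
# compute 5- and 10-year climatologies from the surface time-series of domain 1
ec = computeClimatology(experiment, 'srfc', 1, periods=[5, 10], offset=0,
                        ldebug=True, logger=logger)
assert ec == 0  # the only measure of success is the absence of a crash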
def performExtraction(dataset, mode, stnfct, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' Worker function to extract point data from a gridded dataset. '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not callable(stnfct): raise TypeError  # function to load station dataset
    if lparallel:
        if not lwrite:
            raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    lclim = False; lts = False
    if mode == 'climatology': lclim = True
    elif mode == 'time-series': lts = True
    else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))

    ## extract meta data from arguments
    module, dataargs, loadfct, filepath, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder

    # load template dataset
    stndata = stnfct()  # load station dataset from function
    if not isinstance(stndata, Dataset): raise TypeError
    # N.B.: the loading function is necessary, because DatasetNetCDF instances do not pickle well

    # determine age of source file
    if not loverwrite:
        sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))

    # get filename for target dataset and do some checks
    filename = getTargetFile(stndata.name, dataset, mode, module, dataargs, lwrite)
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_exstns_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_exstns_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
                if age > sourceage and os.path.getsize(filepath) > 1e5: lskip = True
                # N.B.: NetCDF files smaller than 100kB are usually incomplete header fragments from a previous crash
            if not lskip: os.remove(filepath)  # recompute

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))

        # print message
        if lclim:
            opmsgstr = "Extracting '{:s}'-type Point Data from Climatology ({:s})".format(stndata.name, periodstr)
        elif lts:
            opmsgstr = "Extracting '{:s}'-type Point Data from Time-series".format(stndata.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = dataargs.periodstr if dataargs.periodstr else 'time-series'
        atts['name'] = dataset_name; atts['station'] = stndata.name
        atts['title'] = '{:s} (Stations) from {:s} {:s}'.format(stndata.title, dataset_name, mode.title())
        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)
        # extract data at station locations
        CPU.Extract(template=stndata, flush=True)
        # get results
        CPU.sync(flush=True)

        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload(); del source  #, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
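# Sketch: because DatasetNetCDF instances do not pickle well, the station data is
# passed to performExtraction as a zero-argument loader function (stnfct) rather
# than as a dataset. A partial application of a station loader does the job;
# 'loadEC_stations' and its keyword arguments are assumptions for illustration.
from functools import partial

stnfct = partial(loadEC_stations, prov='ON', varlist=['precip', 'T2'])  # hypothetical loader
ec = performExtraction('GPCC', 'climatology', stnfct, dict(period=(1979, 1994)),
                       lwrite=True, ldebug=True)
# returns 0 on success (None if the target file was skipped)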
#sink.mask(sink.landmask)
#print sink.dataset
addLandMask(sink)  # create landmask from precip mask
#sink.stations.mask(sink.landmask)  # mask all fields using the new landmask
# add length and names of month
addLengthAndNamesOfMonth(sink, noleap=False)
#newvar = sink.precip
#print
#print newvar.name, newvar.masked
#print newvar.fillValue
#print newvar.data_array.__class__
#print
# close...
sink.sync()
sink.close()
# print dataset
print('')
print(sink)
del sink
print('')
## print time coordinate
#dataset = loadGPCC(grid=grid, resolution=res, period=period)
#print dataset
#print
#print dataset.time
#print
#print dataset.time.data_array
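# A minimal sketch of what the addLandMask step above amounts to, assuming the land
# mask is derived from the fill mask of the precipitation field (ocean cells are
# missing in gauge-based products); plain numpy stand-in, not the actual API:
import numpy as np

precip = np.ma.masked_invalid(np.array([[1.2, np.nan], [0.3, 2.1]]))
landmask = np.ma.getmaskarray(precip)  # True where precip is missing (i.e. ocean)
t2 = np.ma.masked_array([[280., 281.], [279., 283.]], mask=landmask)  # mask another field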
def performRegridding(dataset, mode, griddef, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' Worker function to perform regridding for a given dataset and target grid. '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not isinstance(griddef, GridDefinition): raise TypeError
    if lparallel:
        if not lwrite:
            raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder

    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs,
                             lwrite=lwrite, grid=griddef.name.lower())

    # prepare target dataset
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_regrid_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_regrid_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
                if age > srcage and os.path.getsize(filepath) > 1e6: lskip = True
                if hasattr(griddef, 'filepath') and griddef.filepath is not None:
                    gridage = datetime.fromtimestamp(os.path.getmtime(griddef.filepath))
                    if age < gridage: lskip = False
                # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))

        # print message
        if mode == 'climatology':
            opmsgstr = 'Regridding Climatology ({:s}) to {:s} Grid'.format(periodstr, griddef.name)
        elif mode == 'time-series':
            opmsgstr = 'Regridding Time-series to {:s} Grid'.format(griddef.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = periodstr
        atts['name'] = dataset_name
        atts['grid'] = griddef.name
        if mode == 'climatology':
            atts['title'] = '{:s} Climatology on {:s} Grid'.format(dataset_name, griddef.name)
        elif mode == 'time-series':
            atts['title'] = '{:s} Time-series on {:s} Grid'.format(dataset_name, griddef.name)

        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)

        # perform regridding (if target grid is different from native grid!)
        if griddef.name != dataset:
            # reproject and resample (regrid) dataset
            CPU.Regrid(griddef=griddef, flush=True)

        # get results
        CPU.sync(flush=True)

        # add geolocators
        sink = addGeoLocator(sink, griddef=griddef, lgdal=True, lreplace=True, lcheck=True)
        # N.B.: WRF datasets come with their own geolocator arrays - we need to replace those!

        # add length and names of month
        if mode == 'climatology' and not sink.hasVariable('length_of_month') and sink.hasVariable('time'):
            addLengthAndNamesOfMonth(sink, noleap=True if dataset.upper() in ('WRF', 'CESM') else False)

        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)  # this would also overwrite the old file...
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload(); del source, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
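# Usage sketch: regridding one dataset to a target grid. 'getGridDef' is a
# hypothetical helper; any GridDefinition instance (e.g. one derived from a WRF
# domain) can serve as the target grid, and dataargs are passed through to the
# dataset's load function.
griddef = getGridDef('arb2_d02')  # hypothetical lookup of a GridDefinition
ec = performRegridding('GPCC', 'climatology', griddef,
                       dict(period=(1979, 1994), resolution='025'),
                       lwrite=True, ldebug=True)
# returns 0 on success (None if the target file was skipped)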
sink.atts.period = periodstr
# determine averaging interval
offset = source.time.getIndex(period[0] - 1979) / 12  # origin of monthly time-series is at January 1979
# initialize processing
#CPU = CentralProcessingUnit(source, sink, varlist=['precip', 'T2'], tmp=True)  # no need for lat/lon
CPU = CentralProcessingUnit(source, sink, varlist=None, tmp=True)  # no need for lat/lon
# start processing climatology
CPU.Climatology(period=period[1] - period[0], offset=offset, flush=False)
# sync temporary storage with output
CPU.sync(flush=True)
## make new masks
#sink.mask(sink.landmask, maskSelf=False, varlist=['snow','snowh','zs'], invert=True, merge=False)
# add names and length of months
sink.axisAnnotation('name_of_month', name_of_month, 'time',
                    atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
#print ' === month === '
#sink += VarNC(sink.dataset, name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
#              atts=dict(name='length_of_month', units='days', long_name='Length of Month'))
# close...
sink.sync()
sink.close()
# print dataset
print('')
print(sink)
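# Worked example of the averaging-interval arithmetic above, assuming a monthly
# time axis with its origin at January 1979: a period starting in 1984 begins
# 60 months into the series, i.e. at an offset of 5 (full) years.
period = (1984, 1994)
month_index = (period[0] - 1979) * 12  # 60 months into the monthly time-series
offset = month_index // 12  # = 5 years, as passed to CPU.Climatology
assert offset == 5 and period[1] - period[0] == 10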
def performExtraction(dataset, mode, stnfct, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' Worker function to extract point data from a gridded dataset. '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not callable(stnfct): raise TypeError  # function to load station dataset
    if lparallel:
        if not lwrite:
            raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    lclim = False; lts = False
    if mode == 'climatology': lclim = True
    elif mode == 'time-series': lts = True
    else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder

    # load template dataset
    stndata = stnfct()  # load station dataset from function
    if not isinstance(stndata, Dataset): raise TypeError
    # N.B.: the loading function is necessary, because DatasetNetCDF instances do not pickle well

    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs,
                             lwrite=lwrite, station=stndata.name)
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_exstns_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_exstns_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
                if age > srcage and os.path.getsize(filepath) > 1e5: lskip = True
                # N.B.: NetCDF files smaller than 100kB are usually incomplete header fragments from a previous crash

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))

        # print message
        if lclim:
            opmsgstr = "Extracting '{:s}'-type Point Data from Climatology ({:s})".format(stndata.name, periodstr)
        elif lts:
            opmsgstr = "Extracting '{:s}'-type Point Data from Time-series".format(stndata.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = dataargs.periodstr if dataargs.periodstr else 'time-series'
        atts['name'] = dataset_name
        atts['station'] = stndata.name
        atts['title'] = '{:s} (Stations) from {:s} {:s}'.format(stndata.title, dataset_name, mode.title())
        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)
        # extract data at station locations
        CPU.Extract(template=stndata, flush=True)
        # get results
        CPU.sync(flush=True)

        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload(); del source  #, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
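# Sketch: in serial mode the extracted stations can also be returned as an
# in-memory Dataset instead of being written to disk (lparallel must be False);
# 'stnfct' is the same zero-argument station loader as sketched earlier.
stndata = performExtraction('NARR', 'time-series', stnfct, dict(),
                            lwrite=False, lreturn=True, lparallel=False)
print(stndata)  # inspect the extracted point data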
def performRegridding(dataset, mode, griddef, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' Worker function to perform regridding for a given dataset and target grid. '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not isinstance(griddef, GridDefinition): raise TypeError
    if lparallel:
        if not lwrite:
            raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn:
            raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')

    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))

    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; avgfolder = dataargs.avgfolder

    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=lwrite,
                             grid=griddef.name.lower(), period=None, filetype=None)

    # prepare target dataset
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_regrid_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_regrid_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if source file is newer than sink file or if sink file is a stub, recompute, otherwise skip
                if age > srcage and os.path.getsize(filepath) > 1e6: lskip = True
                if hasattr(griddef, 'filepath') and griddef.filepath is not None:
                    gridage = datetime.fromtimestamp(os.path.getmtime(griddef.filepath))
                    if age < gridage: lskip = False
                # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
            if not lskip: os.remove(filepath)  # recompute

    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))

        # print message
        if mode == 'climatology':
            opmsgstr = 'Regridding Climatology ({:s}) to {:s} Grid'.format(periodstr, griddef.name)
        elif mode == 'time-series':
            opmsgstr = 'Regridding Time-series to {:s} Grid'.format(griddef.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = periodstr; atts['name'] = dataset_name; atts['grid'] = griddef.name
        if mode == 'climatology':
            atts['title'] = '{:s} Climatology on {:s} Grid'.format(dataset_name, griddef.name)
        elif mode == 'time-series':
            atts['title'] = '{:s} Time-series on {:s} Grid'.format(dataset_name, griddef.name)

        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory

        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)

        # perform regridding (if target grid is different from native grid!)
        if griddef.name != dataset:
            # reproject and resample (regrid) dataset
            CPU.Regrid(griddef=griddef, flush=True)

        # get results
        CPU.sync(flush=True)

        # add geolocators
        sink = addGeoLocator(sink, griddef=griddef, lgdal=True, lreplace=True, lcheck=True)
        # N.B.: WRF datasets come with their own geolocator arrays - we need to replace those!

        # add length and names of month
        if mode == 'climatology' and not sink.hasVariable('length_of_month') and sink.hasVariable('time'):
            addLengthAndNamesOfMonth(sink, noleap=True if dataset.upper() in ('WRF', 'CESM') else False)

        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)  # this would also overwrite the old file...
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed

        # clean up and return
        source.unload(); del source, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
def computeClimatology(experiment, filetype, domain, periods=None, offset=0, griddef=None,
                       varlist=None, ldebug=False, loverwrite=False, lparallel=False,
                       pidstr='', logger=None):
    ''' Worker function to compute climatologies for given file parameters. '''
    # input type checks
    if not isinstance(experiment, Exp): raise TypeError
    if not isinstance(filetype, basestring): raise TypeError
    if not isinstance(domain, (np.integer, int)): raise TypeError
    if periods is not None and not (isinstance(periods, (tuple, list)) and isInt(periods)):
        raise TypeError
    if not isinstance(offset, (np.integer, int)): raise TypeError
    if not isinstance(loverwrite, (bool, np.bool)): raise TypeError
    if griddef is not None and not isinstance(griddef, GridDefinition): raise TypeError
    #if pidstr == '[proc01]': raise TypeError # to test error handling

    # load source
    dataset_name = experiment.name
    fileclass = fileclasses[filetype]  # used for target file name
    tsfile = fileclass.tsfile.format(domain, '')
    expfolder = experiment.avgfolder
    filepath = '{:s}/{:s}'.format(expfolder, tsfile)
    logger.info('\n\n{0:s} *** Processing Experiment {1:<15s} *** '.format(pidstr, "'{:s}'".format(dataset_name)) +
                '\n{0:s} *** {1:^37s} *** \n'.format(pidstr, "'{:s}'".format(tsfile)))

    # check file and read begin/end dates
    if not os.path.exists(filepath):
        #raise IOError("Source file '{:s}' does not exist!".format(filepath))
        # print message and skip
        skipmsg = "\n{:s} >>> File '{:s}' in dataset '{:s}' is missing --- skipping!".format(pidstr, tsfile, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.warning(skipmsg)
        # N.B.: this can cause a lot of error messages, when not all files are present
    else:  # if the monthly source file exists
        import netCDF4 as nc
        ncfile = nc.Dataset(filepath, mode='r')
        begintuple = ncfile.begin_date.split('-')
        endtuple = ncfile.end_date.split('-')
        ncfile.close()
        # N.B.: at this point we don't want to initialize a full GDAL-enabled dataset, since we
        #       don't even know if we need it, and it creates a lot of overhead

        # determine age of source file
        if not loverwrite:
            sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))

        # figure out start date
        filebegin = int(begintuple[0])  # first element is the year
        fileend = int(endtuple[0])  # first element is the year
        begindate = offset + filebegin
        if not (filebegin <= begindate <= fileend): raise DateError
        # handle cases where the first month in the record is not January
        firstmonth = int(begintuple[1])  # second element is the month
        shift = firstmonth - 1  # will be zero for January (01)

        ## loop over periods
        if periods is None: periods = [begindate - fileend]
        #periods.sort(reverse=True)  # reverse, so that the largest chunk is done first
        source = None  # will later be assigned to the source dataset
        for period in periods:
            # figure out period
            enddate = begindate + period
            if filebegin > enddate:
                raise DateError('End date earlier than begin date.')
            if enddate - 1 > fileend:
                # N.B.: if filebegin is 1979 and the simulation is 10 years, fileend will be 1988, not 1989!
                # if the end date is not available, skip this period
                endmsg = "\n{:s} --- Invalid Period for '{:s}': End Date {:4d} not in File! --- \n".format(pidstr, dataset_name, enddate)
                endmsg += "{:s} --- ('{:s}')\n".format(pidstr, filepath)
                logger.info(endmsg)
            else:
                ## perform averaging for selected period
                # determine if sink file already exists, and what to do about it
                periodstr = '{0:4d}-{1:4d}'.format(begindate, enddate)
                gridstr = '' if griddef is None or griddef.name == 'WRF' else '_' + griddef.name
                filename = fileclass.climfile.format(domain, gridstr, '_' + periodstr)
                if ldebug: filename = 'test_' + filename
                if lparallel: tmppfx = 'tmp_wrfavg_{:s}_'.format(pidstr[1:-1])
                else: tmppfx = 'tmp_wrfavg_'
                tmpfilename = tmppfx + filename
                assert os.path.exists(expfolder)
                filepath = expfolder + filename
                tmpfilepath = expfolder + tmpfilename
                lskip = False  # else just go ahead
                if os.path.exists(filepath):
                    if not loverwrite:
                        age = datetime.fromtimestamp(os.path.getmtime(filepath))
                        # if sink file is newer than source file, skip (do not recompute)
                        if age > sourceage and os.path.getsize(filepath) > 1e6: lskip = True
                        # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
                        #print sourceage, age
                    if not lskip: os.remove(filepath)

                # depending on last modification time of file or overwrite setting, start computation, or skip
                if lskip:
                    # print message
                    skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
                    skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(skipmsg)
                else:
                    # determine whether we have to regrid
                    lregrid = griddef is not None
                    ## begin actual computation
                    beginmsg = "\n{:s} <<< Computing '{:s}' (d{:02d}) Climatology from {:s}".format(pidstr, dataset_name, domain, periodstr)
                    if not lregrid: beginmsg += " >>> \n"
                    else: beginmsg += " ('{:s}' grid) >>> \n".format(griddef.name)
                    logger.info(beginmsg)

                    ## actually load datasets
                    if source is None:
                        source = loadWRF_TS(experiment=experiment, filetypes=[filetype], domains=domain)  # comes out as a tuple...
                    if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')

                    # prepare sink
                    if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
                    sink = DatasetNetCDF(name='WRF Climatology', folder=expfolder, filelist=[tmpfilename],
                                         atts=source.atts.copy(), mode='w')
                    sink.atts.period = periodstr
                    #if lregrid: addGDALtoDataset(sink, griddef=griddef)

                    # initialize processing
                    CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=lregrid, feedback=ldebug)  # no need for lat/lon

                    # start processing climatology
                    if shift != 0:
                        logger.info('{0:s} (shifting climatology by {1:d} month, to start with January) \n'.format(pidstr, shift))
                    CPU.Climatology(period=period, offset=offset, shift=shift, flush=False)
                    # N.B.: immediate flushing should not be necessary for climatologies, since they are much smaller!

                    # reproject and resample (regrid) dataset
                    if lregrid:
                        CPU.Regrid(griddef=griddef, flush=True)
                        logger.info('{:s} --- {:s} --- \n'.format(pidstr, griddef.name))
                        logger.debug('{:s} --- {:s} --- \n'.format(pidstr, str(griddef)))

                    # sync temporary storage with output dataset (sink)
                    CPU.sync(flush=True)

                    # add Geopotential Height Variance
                    if 'GHT_Var' in sink and 'Z_var' not in sink:
                        data_array = (sink['GHT_Var'].data_array - sink['Z'].data_array**2)**0.5
                        atts = dict(name='Z_var', units='m', long_name='Square Root of Geopotential Height Variance')
                        sink += Variable(axes=sink['Z'].axes, data=data_array, atts=atts)

                    # add (relative) Vorticity Variance
                    if 'Vorticity_Var' in sink and 'zeta_var' not in sink:
                        data_array = (sink['Vorticity_Var'].data_array - sink['zeta'].data_array**2)**0.5
                        atts = dict(name='zeta_var', units='1/s', long_name='Square Root of Relative Vorticity Variance')
                        sink += Variable(axes=sink['zeta'].axes, data=data_array, atts=atts)

                    # add names and lengths of months
                    sink.axisAnnotation('name_of_month', name_of_month, 'time',
                                        atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
                    if not sink.hasVariable('length_of_month'):
                        sink += Variable(name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
                                         atts=dict(name='length_of_month', units='days', long_name='Length of Month'))

                    # close... and write results to file
                    sink.sync()
                    sink.close()
                    writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
                    writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
                    logger.info(writemsg)
                    # rename file to proper name
                    if os.path.exists(filepath): os.remove(filepath)  # remove old file
                    os.rename(tmpfilepath, filepath)  # this will overwrite the old file

                    # print dataset
                    if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
                    # clean up (not sure if this is necessary, but there seems to be a memory leak...)
                    del sink, CPU
                    gc.collect()  # get rid of these guys immediately

        # clean up and return
        if source is not None:
            source.unload(); del source
        # N.B.: source is only loaded once for all periods
    # N.B.: garbage is collected in the multi-processing wrapper as well
    return 0  # so far, there is no measure of success other than the absence of a crash...
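# Dispatch sketch: the worker signature (pidstr, logger handles, integer exit codes)
# is designed for an asynchronous process pool. The project presumably has its own
# pool wrapper; plain multiprocessing illustrates the calling convention. A small
# top-level shim creates the per-process logger, since Logger instances do not
# pickle; 'experiments' is an assumed list of Exp instances.
import logging
import multiprocessing

def climatologyTask(experiment, filetype, domain, **kwargs):
    logger = logging.getLogger('wrfavg.worker')  # per-process logger
    if not logger.handlers: logger.addHandler(logging.StreamHandler())
    return computeClimatology(experiment, filetype, domain, logger=logger, **kwargs)

pool = multiprocessing.Pool(processes=4)
results = [pool.apply_async(climatologyTask, (exp, 'srfc', dom),
                            dict(periods=[5, 10], lparallel=True, pidstr='[proc]'))
           for exp in experiments for dom in (1, 2)]
pool.close(); pool.join()
exitcodes = [r.get() for r in results]  # 0 on success; worker exceptions re-raise here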