Example #1
0
    print('\n')
    # prepare sink
    gridstr = '' if grid is 'NARR' else '_'+grid
    filename = avgfile.format(gridstr,'_'+periodstr)
    if os.path.exists(avgfolder+filename): os.remove(avgfolder+filename)
    sink = DatasetNetCDF(name='NARR Climatology', folder=avgfolder, filelist=[filename], atts=source.atts, mode='w')
    sink.atts.period = periodstr 
    
    # determine averaging interval
    offset = source.time.getIndex(period[0]-1979)/12 # origin of monthly time-series is at January 1979 
    # initialize processing
#     CPU = CentralProcessingUnit(source, sink, varlist=['precip', 'T2'], tmp=True) # no need for lat/lon
    CPU = CentralProcessingUnit(source, sink, varlist=None, tmp=True) # no need for lat/lon
    
    # start processing climatology
    CPU.Climatology(period=period[1]-period[0], offset=offset, flush=False)
    
    # sync temporary storage with output
    CPU.sync(flush=True)

#     # make new masks
#     sink.mask(sink.landmask, maskSelf=False, varlist=['snow','snowh','zs'], invert=True, merge=False)

    # add names and length of months
    sink.axisAnnotation('name_of_month', name_of_month, 'time', 
                        atts=dict(name='name_of_month', units='', long_name='Name of the Month'))
    #print '   ===   month   ===   '
#     sink += VarNC(sink.dataset, name='length_of_month', units='days', axes=(sink.time,), data=days_per_month,
#                   atts=dict(name='length_of_month',units='days',long_name='Length of Month'))
    
    # close...
Example #2
0
def computeClimatology(experiment,
                       filetype,
                       domain,
                       periods=None,
                       offset=0,
                       griddef=None,
                       varlist=None,
                       ldebug=False,
                       loverwrite=False,
                       lparallel=False,
                       pidstr='',
                       logger=None):
    ''' worker function to compute climatologies for given file parameters. '''
    # input type checks
    if not isinstance(experiment, Exp): raise TypeError
    if not isinstance(filetype, basestring): raise TypeError
    if not isinstance(domain, (np.integer, int)): raise TypeError
    if periods is not None and not (isinstance(periods, (tuple, list))
                                    and isInt(periods)):
        raise TypeError
    if not isinstance(offset, (np.integer, int)): raise TypeError
    if not isinstance(loverwrite, (bool, np.bool)): raise TypeError
    if griddef is not None and not isinstance(griddef, GridDefinition):
        raise TypeError

    #if pidstr == '[proc01]': raise TypeError # to test error handling

    # load source
    dataset_name = experiment.name
    fileclass = fileclasses[filetype]  # used for target file name
    tsfile = fileclass.tsfile.format(domain, '')
    expfolder = experiment.avgfolder
    filepath = '{:s}/{:s}'.format(expfolder, tsfile)
    logger.info('\n\n{0:s}   ***   Processing Experiment {1:<15s}   ***   '.
                format(pidstr, "'{:s}'".format(dataset_name)) +
                '\n{0:s}   ***   {1:^37s}   ***   \n'.format(
                    pidstr, "'{:s}'".format(tsfile)))

    # check file and read begin/enddates
    if not os.path.exists(filepath):
        #raise IOError, "Source file '{:s}' does not exist!".format(filepath)
        # print message and skip
        skipmsg = "\n{:s}   >>>   File '{:s}' in dataset '{:s}' is missing --- skipping!".format(
            pidstr, tsfile, dataset_name)
        skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr, filepath)
        logger.warning(skipmsg)
        # N.B.: this can cause a lot of error messages, when not all files are present
    else:  # if monthly source file exists
        import netCDF4 as nc
        ncfile = nc.Dataset(filepath, mode='r')
        begintuple = ncfile.begin_date.split('-')
        endtuple = ncfile.end_date.split('-')
        ncfile.close()
        # N.B.: at this point we don't want to initialize a full GDAL-enabled dataset, since we don't even
        #       know if we need it, and it creates a lot of overhead

        # determine age of source file
        if not loverwrite:
            sourceage = datetime.fromtimestamp(os.path.getmtime(filepath))

        # figure out start date
        filebegin = int(begintuple[0])  # first element is the year
        fileend = int(endtuple[0])  # first element is the year
        begindate = offset + filebegin
        if not (filebegin <= begindate <= fileend): raise DateError
        # handle cases where the first month in the record is not January
        firstmonth = int(begintuple[1])  # second element is the month
        shift = firstmonth - 1  # will be zero for January (01)

        ## loop over periods
        if periods is None: periods = [begindate - fileend]
        #   periods.sort(reverse=True) # reverse, so that largest chunk is done first
        source = None  # will later be assigned to the source dataset
        for period in periods:

            # figure out period
            enddate = begindate + period
            if filebegin > enddate:
                raise DateError, 'End date earlier than begin date.'
            if enddate - 1 > fileend:  # if filebegin is 1979 and the simulation is 10 years, fileend will be 1988, not 1989!
                # if end date is not available, skip period
                endmsg = "\n{:s}   ---   Invalid Period for '{:s}': End Date {:4d} not in File!   ---   \n".format(
                    pidstr, dataset_name, enddate)
                endmsg += "{:s}   ---   ('{:s}')\n".format(pidstr, filepath)
                logger.info(endmsg)

            else:  ## perform averaging for selected period

                # determine if sink file already exists, and what to do about it
                periodstr = '{0:4d}-{1:4d}'.format(begindate, enddate)
                gridstr = '' if griddef is None or griddef.name is 'WRF' else '_' + griddef.name
                filename = fileclass.climfile.format(domain, gridstr,
                                                     '_' + periodstr)
                if ldebug: filename = 'test_' + filename
                if lparallel: tmppfx = 'tmp_wrfavg_{:s}_'.format(pidstr[1:-1])
                else: tmppfx = 'tmp_wrfavg_'.format(pidstr[1:-1])
                tmpfilename = tmppfx + filename
                assert os.path.exists(expfolder)
                filepath = expfolder + filename
                tmpfilepath = expfolder + tmpfilename
                lskip = False  # else just go ahead
                if os.path.exists(filepath):
                    if not loverwrite:
                        age = datetime.fromtimestamp(
                            os.path.getmtime(filepath))
                        # if sink file is newer than source file, skip (do not recompute)
                        if age > sourceage and os.path.getsize(filepath) > 1e6:
                            lskip = True
                        # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
                        #print sourceage, age
                    if not lskip: os.remove(filepath)

                # depending on last modification time of file or overwrite setting, start computation, or skip
                if lskip:
                    # print message
                    skipmsg = "\n{:s}   >>>   Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(
                        pidstr, filename, dataset_name)
                    skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(
                        pidstr, filepath)
                    logger.info(skipmsg)
                else:

                    if griddef is None: lregrid = False
                    else: lregrid = True

                    ## begin actual computation
                    beginmsg = "\n{:s}   <<<   Computing '{:s}' (d{:02d}) Climatology from {:s}".format(
                        pidstr, dataset_name, domain, periodstr)
                    if not lregrid: beginmsg += "  >>>   \n"
                    else:
                        beginmsg += " ('{:s}' grid)  >>>   \n".format(
                            griddef.name)
                    logger.info(beginmsg)

                    ## actually load datasets
                    if source is None:
                        source = loadWRF_TS(
                            experiment=experiment,
                            filetypes=[filetype],
                            domains=domain)  # comes out as a tuple...
                    if not lparallel and ldebug:
                        logger.info('\n' + str(source) + '\n')

                    # prepare sink
                    if os.path.exists(tmpfilepath):
                        os.remove(tmpfilepath)  # remove old temp files
                    sink = DatasetNetCDF(name='WRF Climatology',
                                         folder=expfolder,
                                         filelist=[tmpfilename],
                                         atts=source.atts.copy(),
                                         mode='w')
                    sink.atts.period = periodstr
                    #           if lregrid: addGDALtoDataset(sink, griddef=griddef)

                    # initialize processing
                    CPU = CentralProcessingUnit(
                        source,
                        sink,
                        varlist=varlist,
                        tmp=lregrid,
                        feedback=ldebug)  # no need for lat/lon

                    # start processing climatology
                    if shift != 0:
                        logger.info(
                            '{0:s}   (shifting climatology by {1:d} month, to start with January)   \n'
                            .format(pidstr, shift))
                    CPU.Climatology(period=period,
                                    offset=offset,
                                    shift=shift,
                                    flush=False)
                    # N.B.: immediate flushing should not be necessary for climatologies, since they are much smaller!

                    # reproject and resample (regrid) dataset
                    if lregrid:
                        CPU.Regrid(griddef=griddef, flush=True)
                        logger.info('{:s}   ---   {:s}   ---   \n'.format(
                            pidstr, griddef.name))
                        logger.debug('{:s}   ---   {:s}   ---   \n'.format(
                            pidstr, str(griddef)))

                    # sync temporary storage with output dataset (sink)
                    CPU.sync(flush=True)

                    # add Geopotential Height Variance
                    if 'GHT_Var' in sink and 'Z_var' not in sink:
                        data_array = (sink['GHT_Var'].data_array -
                                      sink['Z'].data_array**2)**0.5
                        atts = dict(
                            name='Z_var',
                            units='m',
                            long_name=
                            'Square Root of Geopotential Height Variance')
                        sink += Variable(axes=sink['Z'].axes,
                                         data=data_array,
                                         atts=atts)

                    # add (relative) Vorticity Variance
                    if 'Vorticity_Var' in sink and 'zeta_var' not in sink:
                        data_array = (sink['Vorticity_Var'].data_array -
                                      sink['zeta'].data_array**2)**0.5
                        atts = dict(
                            name='zeta_var',
                            units='1/s',
                            long_name=
                            'Square Root of Relative Vorticity Variance')
                        sink += Variable(axes=sink['zeta'].axes,
                                         data=data_array,
                                         atts=atts)

                    # add names and length of months
                    sink.axisAnnotation('name_of_month',
                                        name_of_month,
                                        'time',
                                        atts=dict(
                                            name='name_of_month',
                                            units='',
                                            long_name='Name of the Month'))
                    if not sink.hasVariable('length_of_month'):
                        sink += Variable(name='length_of_month',
                                         units='days',
                                         axes=(sink.time, ),
                                         data=days_per_month,
                                         atts=dict(
                                             name='length_of_month',
                                             units='days',
                                             long_name='Length of Month'))

                    # close... and write results to file
                    sink.sync()
                    sink.close()
                    writemsg = "\n{:s}   >>>   Writing to file '{:s}' in dataset {:s}".format(
                        pidstr, filename, dataset_name)
                    writemsg += "\n{:s}   >>>   ('{:s}')\n".format(
                        pidstr, filepath)
                    logger.info(writemsg)
                    # rename file to proper name
                    if os.path.exists(filepath):
                        os.remove(filepath)  # remove old file
                    os.rename(tmpfilepath,
                              filepath)  # this will overwrite the old file

                    # print dataset
                    if not lparallel and ldebug:
                        logger.info('\n' + str(sink) + '\n')

                    # clean up (not sure if this is necessary, but there seems to be a memory leak...
                    del sink, CPU
                    gc.collect()  # get rid of these guys immediately

        # clean up and return
        if source is not None:
            source.unload()
            del source
        # N.B.: source is only loaded once for all periods

    # N.B.: garbage is collected in multi-processing wrapper as well
    # return
    return 0  # so far, there is no measure of success, hence, if there is no crash...