def getTargetFile(dataset=None, mode=None, dataargs=None, grid=None, shape=None, station=None,
                  period=None, filetype=None, lwrite=True):
  ''' generate filename for target dataset '''
  # for CESM & WRF
  if grid is None: grid = dataargs.gridstr # also use grid for station/shape type
  if period is None: period = dataargs.periodstr
  if dataset in ('WRF','CESM') and lwrite:
    # prepare some variables
    domain = dataargs.domain
    if filetype is None: filetype = dataargs.filetype
    gstr = '_{}'.format(grid) if grid else ''
    # prepend shape or station type before grid
    if shape and station: raise ArgumentError
    elif shape: gstr = '_{}{}'.format(shape,gstr)
    elif station: gstr = '_{}{}'.format(station,gstr)
    pstr = '_{}'.format(period) if period else ''
    if dataset == 'WRF':
      import datasets.WRF as WRF
      fileclass = WRF.fileclasses[filetype] if filetype in WRF.fileclasses else WRF.FileType(filetype)
      if mode == 'climatology': filename = fileclass.climfile.format(domain,gstr,pstr)
      elif mode == 'time-series': filename = fileclass.tsfile.format(domain,gstr)
    elif dataset == 'CESM':
      import datasets.CESM as CESM
      fileclass = CESM.fileclasses[filetype] if filetype in CESM.fileclasses else CESM.FileType(filetype)
      if mode == 'climatology': filename = fileclass.climfile.format(gstr,pstr)
      elif mode == 'time-series': filename = fileclass.tsfile.format(gstr)
      else: raise NotImplementedError("Unsupported Mode: '{:s}'".format(mode))
  elif lwrite:
    # assume observational datasets
    filename = getFileName(grid=grid, shape=shape, station=station, period=period,
                           name=dataargs.obs_res, filetype=mode)
  else: raise DatasetError(dataset)
  if not os.path.exists(dataargs.avgfolder):
    raise IOError("Dataset folder '{:s}' does not exist!".format(dataargs.avgfolder))
  # return filename
  return filename
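# Usage sketch for getTargetFile (illustrative only): 'dataargs' is normally the
# namedtuple returned by getMetaData below; the stub and all values here are
# hypothetical and merely show which attributes the function reads.
#
#   from collections import namedtuple
#   _DataArgs = namedtuple('DataArgs', ('gridstr','periodstr','domain','filetype',
#                                       'avgfolder','obs_res'))
#   dataargs = _DataArgs(gridstr='grw1', periodstr='1979-1994', domain=2,
#                        filetype='srfc', avgfolder='/data/WRF/avg/', obs_res=None)
#   filename = getTargetFile(dataset='WRF', mode='climatology', dataargs=dataargs)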
def loadUnity(name=dataset_name, period=None, grid=None, varlist=None, varatts=None,
              folder=avgfolder, filelist=None, lautoregrid=False, resolution=None, unity_grid=None):
  ''' Get the pre-processed, unified monthly climatology as a DatasetNetCDF. '''
  #if lautoregrid: warn("Auto-regridding is currently not available for the unified dataset - use the generator routine instead.")
  # a climatology is not available
  if period is None:
    period = (1979,2009)
    warn('A climatology is not available for the Unified Dataset; loading period {0:4d}-{1:4d}.'.format(*period))
  # this dataset has no native/default grid
  if grid is None:
    if unity_grid is None:
      raise DatasetError("The Unified Dataset has no native grid; need to define a default grid 'unity_grid'.")
    grid = unity_grid
    warn('The Unified Dataset has no native grid; loading {0:s} grid.'.format(grid))
  # load standardized climatology dataset with dataset-specific parameters
  dataset = loadSpecialObs(name=name, folder=folder, period=period, grid=grid, shape=None, station=None,
                           varlist=varlist, varatts=varatts, filepattern=avgfile, filelist=filelist,
                           projection=None, mode='climatology', lautoregrid=False)
  # return formatted dataset
  return dataset
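# Usage sketch (hypothetical grid name and variable list; requires the pre-processed
# unified climatology files to be present in 'avgfolder'):
#
#   unity = loadUnity(grid='grw1_d02', period=(1979,2009), varlist=['precip','T2'])
#   print(unity)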
def loadHGS_StnTS(station=None, varlist=None, varatts=None, folder=None, name=None, title=None,
                  start_date=None, end_date=None, run_period=15, period=None, lskipNaN=False,
                  lcheckComplete=True, basin=None, WSC_station=None, basin_list=None, filename=None,
                  prefix=None, scalefactors=None, **kwargs):
  ''' Get a properly formatted HGS dataset with monthly time-series at station locations; as in
      the hgsrun module, the capitalized kwargs can be used to construct folders and/or names '''
  if folder is None or ( filename is None and station is None ): raise ArgumentError
  # try to find meta data for gage station from WSC
  HGS_station = station
  if basin is not None and basin_list is not None:
    station_name = station
    station = getGageStation(basin=basin, station=station if WSC_station is None else WSC_station,
                             basin_list=basin_list) # only works with registered basins
    if station_name is None: station_name = station.name # backup, in case we don't have a HGS station name
    metadata = station.getMetaData() # load station meta data
    if metadata is None: raise GageStationError(name)
  else:
    metadata = dict(); station = None; station_name = None
  # prepare name expansion arguments (all capitalized)
  expargs = dict(ROOT_FOLDER=root_folder, STATION=HGS_station, NAME=name, TITLE=title,
                 PREFIX=prefix, BASIN=basin, WSC_STATION=WSC_station)
  for key,value in metadata.items():
    if isinstance(value,basestring):
      expargs['WSC_'+key.upper()] = value # in particular, this includes WSC_ID
  if 'WSC_ID' in expargs:
    if expargs['WSC_ID'][0] == '0': expargs['WSC_ID0'] = expargs['WSC_ID'][1:]
    else: raise DatasetError('Expected leading zero in WSC station ID: {}'.format(expargs['WSC_ID']))
  # preset exparg keys will be overwritten, if capitalized versions are defined
  for key,value in kwargs.items():
    KEY = key.upper() # we only use capitalized keywords; non-capitalized keywords are only used/converted
    if KEY == key or KEY not in kwargs: expargs[KEY] = value # if no capitalized version is defined
  # read folder and infer prefix, if necessary
  folder = folder.format(**expargs)
  if not os.path.exists(folder): raise IOError(folder)
  if expargs['PREFIX'] is None:
    with open('{}/{}'.format(folder,prefix_file), 'r') as pfx:
      expargs['PREFIX'] = prefix = ''.join(pfx.readlines()).strip()
  # now assemble file name for station timeseries
  filename = filename.format(**expargs)
  filepath = '{}/{}'.format(folder,filename)
  if not os.path.exists(filepath): raise IOError(filepath)
  if station_name is None:
    station_name = filename[filename.index('hydrograph.')+1:-4] if station is None else station
  # set meta data (and allow keyword expansion of name and title)
  metadata['problem'] = prefix
  metadata['station_name'] = metadata.get('long_name', station_name)
  if name is not None: name = name.format(**expargs) # name expansion with capitalized keyword arguments
  else: name = 'HGS_{:s}'.format(station_name)
  metadata['name'] = name; expargs['Name'] = name.title() # name in title format
  if title is None: title = '{{Name:s}} (HGS, {problem:s})'.format(**metadata)
  title = title.format(**expargs) # name expansion with capitalized keyword arguments
  metadata['long_name'] = metadata['title'] = title
  # now determine start date for date_parser
  if end_date is None:
    if start_date and run_period: end_date = start_date + run_period
    elif period: end_date = period[1]
    else: raise ArgumentError("Need to specify either 'start_date' & 'run_period' or 'period' to infer 'end_date'.")
  end_year,end_month,end_day = convertDate(end_date)
  if start_date is None:
    if end_date and run_period: start_date = end_date - run_period
    elif period: start_date = period[0]
    else: raise ArgumentError("Need to specify either 'end_date' & 'run_period' or 'period' to infer 'start_date'.")
  start_year,start_month,start_day = convertDate(start_date)
  if start_day != 1 or end_day != 1:
    raise NotImplementedError('Currently only monthly data is supported.')
  # import functools
  # date_parser = functools.partial(date_parser, year=start_year, month=start_month, day=start_day)
  # # now load data using pandas ascii reader
  # data_frame = pd.read_table(filepath, sep='\s+', header=2, dtype=np.float64, index_col=['time'],
  #                            date_parser=date_parser, names=ascii_varlist)
  # # resample to monthly data
  # data_frame = data_frame.resample(resampling).agg(np.mean)
  # data = data_frame[flowvar].values
  # parse header
  if varlist is None: varlist = variable_list[:] # default list
  with open(filepath, 'r') as f:
    line = f.readline(); lline = line.lower() # 1st line
    if not "hydrograph" in lline: raise GageStationError(line,filepath)
    # parse variables and determine columns
    line = f.readline(); lline = line.lower() # 2nd line
    if not "variables" in lline: raise GageStationError(line)
    variable_order = [v.strip('"').lower() for v in line[line.find('"'):].strip().split(',')]
  # figure out varlist and data columns
  if variable_order[0] == 'time': del variable_order[0] # only keep variables
  else: raise GageStationError(variable_order)
  variable_order = [hgs_variables[v] for v in variable_order] # replace HGS names with GeoPy names
  vardict = {v:i+1 for i,v in enumerate(variable_order)} # column mapping; +1 because time was removed
  variable_order = [v for v in variable_order if v in varlist or flow_to_flux[v] in varlist]
  usecols = tuple(vardict[v] for v in variable_order) # variable columns that need to be loaded (except time, which is col 0)
  assert 0 not in usecols, usecols
  # load data as tab separated values
  data = np.genfromtxt(filepath, dtype=np.float64, delimiter=None, skip_header=3, usecols=(0,)+usecols)
  assert data.shape[1] == len(usecols)+1, data.shape
  if lskipNaN:
    data = data[np.isnan(data).sum(axis=1)==0,:]
  elif np.any( np.isnan(data) ):
    raise DataError("Missing values (NaN) encountered in hydrograph file; use 'lskipNaN' to ignore.\n('{:s}')".format(filepath))
  time_series = data[:,0]; flow_data = data[:,1:]
  assert flow_data.shape == (len(time_series),len(usecols)), flow_data.shape
  # original time deltas in seconds
  time_diff = time_series.copy(); time_diff[1:] = np.diff(time_series) # time period between time steps
  assert np.all( time_diff > 0 ), filepath
  time_diff = time_diff.reshape((len(time_diff),1)) # reshape to make sure broadcasting works
  # integrate flow over time steps before resampling
  flow_data[1:,:] -= np.diff(flow_data, axis=0)/2. # get average flow between time steps
  flow_data *= time_diff # integrate flow in time interval by multiplying average flow with time period
  flow_data = np.cumsum(flow_data, axis=0) # integrate by summing up total flow per time interval
  # generate regular monthly time steps
  start_datetime = np.datetime64(dt.datetime(year=start_year, month=start_month, day=start_day), 'M')
  end_datetime = np.datetime64(dt.datetime(year=end_year, month=end_month, day=end_day), 'M')
  time_monthly = np.arange(start_datetime, end_datetime+np.timedelta64(1,'M'), dtype='datetime64[M]')
  assert time_monthly[0] == start_datetime, time_monthly[0]
  assert time_monthly[-1] == end_datetime, time_monthly[-1]
  # convert monthly time series to regular array of seconds since start date
  time_monthly = ( time_monthly.astype('datetime64[s]') - start_datetime.astype('datetime64[s]') ) / np.timedelta64(1,'s')
  assert time_monthly[0] == 0, time_monthly[0]
  # interpolate integrated flow to new time axis
  #flow_data = np.interp(time_monthly, xp=time_series[:,0], fp=flow_data[:,0],).reshape((len(time_monthly),1))
  time_series = np.concatenate(([0],time_series), axis=0) # integrated flow at time zero must be zero...
  flow_data = np.concatenate(([[0,]*len(usecols)],flow_data), axis=0) # ... this is probably better than interpolation
  # N.B.: we are adding zeros here so we don't have to extrapolate to the left; on the right we just fill in NaN's
  # check completeness of the data record (test the 5-day threshold first, so that the
  # hard error is not shadowed by the 3-day warning)
  if ( time_monthly[-1] - time_series[-1] ) > 5*86400.:
    msg = "Data record ends more than 5 days before end of period: {} days".format((time_monthly[-1]-time_series[-1])/86400.)
    if lcheckComplete: raise DataError(msg)
    else: warn(msg)
  elif ( time_monthly[-1] - time_series[-1] ) > 3*86400. and lcheckComplete:
    warn("Data record ends more than 3 days before end of period: {} days".format((time_monthly[-1]-time_series[-1])/86400.))
  flow_interp = si.interp1d(x=time_series, y=flow_data, kind='linear', axis=0, copy=False,
                            bounds_error=False, fill_value=np.NaN, assume_sorted=True)
  flow_data = flow_interp(time_monthly) # evaluate with call
  # compute monthly flow rate from interpolated integrated flow
  flow_data = np.diff(flow_data, axis=0) / np.diff(time_monthly, axis=0).reshape((len(time_monthly)-1,1))
  flow_data *= 1000 # convert from m^3/s to kg/s (density of water: 1000 kg/m^3)
  # construct time axis
  start_time = 12*(start_year - 1979) + start_month -1
  end_time = 12*(end_year - 1979) + end_month -1
  time = Axis(name='time', units='month', atts=dict(long_name='Month since 1979-01'),
              coord=np.arange(start_time, end_time)) # not including the last; e.g. 1979-01 to 1980-01 is 12 months
  assert len(time_monthly) == end_time-start_time+1
  assert flow_data.shape == (len(time),len(variable_order)), (flow_data.shape,len(time),len(variable_order))
  # construct dataset
  dataset = Dataset(atts=metadata)
  dataset.station = station # add gage station object, if available (else None)
  for i,flowvar in enumerate(variable_order):
    data = flow_data[:,i]
    fluxvar = flow_to_flux[flowvar]
    if flowvar in varlist:
      flowatts = variable_attributes[flowvar]
      # convert variables and put into dataset (monthly time series)
      if flowatts['units'] != 'kg/s':
        raise VariableError("Hydrograph data is read as kg/s; flow variable does not match.\n{}".format(flowatts))
      dataset += Variable(data=data, axes=(time,), **flowatts)
    if fluxvar in varlist and 'shp_area' in metadata:
      # compute surface flux variable based on drainage area
      fluxatts = variable_attributes[fluxvar]
      if fluxatts['units'] != 'kg/m^2/s': raise VariableError(fluxatts)
      data = data / metadata['shp_area'] # need to make a copy
      dataset += Variable(data=data, axes=(time,), **fluxatts)
  # apply analysis period
  if period is not None: dataset = dataset(years=period)
  # adjust scalefactors, if necessary
  if scalefactors:
    if isinstance(scalefactors,dict):
      dataset = updateScalefactor(dataset, varlist=scalefactors, scalefactor=None)
    elif isNumber(scalefactors):
      scalelist = ('discharge','seepage','flow')
      dataset = updateScalefactor(dataset, varlist=scalelist, scalefactor=scalefactors)
    else: raise TypeError(scalefactors)
  # return completed dataset
  return dataset
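# The monthly resampling above integrates flow over the irregular output intervals
# (trapezoid rule), interpolates the accumulated flow to regular month boundaries,
# and differences it to recover mean monthly flow rates. A minimal, self-contained
# sketch of that scheme (synthetic numbers; not part of the HGS reader itself):
def _monthly_resampling_sketch():
  import numpy as np
  t = np.array([0., 10., 25., 31.]) * 86400.               # irregular time steps [s]
  q = np.array([1.0, 2.0, 1.5, 1.0])                       # instantaneous flow rates [m^3/s]
  qa = 0.5 * ( q[1:] + q[:-1] )                            # average flow per interval (trapezoid rule)
  cum = np.concatenate(([0.], np.cumsum(qa * np.diff(t)))) # accumulated flow at step times [m^3]
  tm = np.array([0., 31.]) * 86400.                        # regular month boundaries [s]
  cm = np.interp(tm, t, cum)                               # accumulated flow at month boundaries
  return np.diff(cm) / np.diff(tm)                         # mean monthly flow rate [m^3/s]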
def getMetaData(dataset, mode, dataargs, lone=True):
  ''' determine dataset type and meta data, as well as path to main source file '''
  # determine dataset mode
  lclim = False; lts = False
  if mode == 'climatology': lclim = True
  elif mode == 'time-series': lts = True
  elif mode[-5:] == '-mean':
    lclim = True
    mode = 'climatology' # only for export to seasonal means (load entire monthly climatology)
  else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
  # general arguments (dataset independent)
  varlist = dataargs.get('varlist',None)
  resolution = dataargs.get('resolution',None)
  grid = dataargs.get('grid',None) # get grid
  period = dataargs.get('period',None)
  # determine meta data based on dataset type
  if dataset == 'WRF':
    import datasets.WRF as WRF # WRF datasets
    obs_res = None # only for observational datasets (not used here)
    exp = dataargs['experiment'] # need that one
    dataset_name = exp.name
    avgfolder = exp.avgfolder
    filetypes = dataargs['filetypes']
    fileclasses = WRF.fileclasses.copy()
    for filetype in filetypes:
      if filetype not in fileclasses: fileclasses[filetype] = WRF.FileType(filetype)
    domain = dataargs.get('domain',None)
    periodstr, gridstr = getPeriodGridString(period, grid, exp=exp)
    # check arguments
    if period is None and lclim: raise DatasetError("A 'period' argument is required to load climatologies!")
    if lone and len(filetypes) > 1: raise DatasetError # process only one file at a time
    if not isinstance(domain, (np.integer,int)): raise DatasetError
    # construct dataset message
    if lone:
      datamsgstr = "Processing WRF '{:s}'-file from Experiment '{:s}' (d{:02d})".format(filetypes[0], dataset_name, domain)
    else:
      datamsgstr = "Processing WRF dataset from Experiment '{:s}' (d{:02d})".format(dataset_name, domain)
    # figure out age of source file(s)
    srcage = getSourceAge(fileclasses=fileclasses, filetypes=filetypes, exp=exp, domain=domain,
                          periodstr=periodstr, gridstr=gridstr, lclim=lclim, lts=lts)
    # load source data
    if lclim:
      loadfct = partial(WRF.loadWRF, experiment=exp, name=None, domains=domain, grid=grid, varlist=varlist,
                        period=period, filetypes=filetypes, varatts=None, lconst=True, ltrimT=False) # still want topography...
    elif lts:
      loadfct = partial(WRF.loadWRF_TS, experiment=exp, name=None, domains=domain, grid=grid, varlist=varlist,
                        filetypes=filetypes, varatts=None, lconst=True, ltrimT=False) # still want topography...
  elif dataset == 'CESM':
    import datasets.CESM as CESM # CESM datasets
    obs_res = None # only for observational datasets (not used here)
    domain = None # only for WRF
    exp = dataargs['experiment']
    avgfolder = exp.avgfolder
    dataset_name = exp.name
    periodstr, gridstr = getPeriodGridString(period, grid, exp=exp)
    filetypes = dataargs['filetypes']
    fileclasses = CESM.fileclasses.copy()
    for filetype in filetypes:
      if filetype not in fileclasses: fileclasses[filetype] = CESM.FileType(filetype)
    # check arguments
    if period is None and lclim: raise DatasetError("A 'period' argument is required to load climatologies!")
    if lone and len(filetypes) > 1: raise DatasetError # process only one file at a time
    # construct dataset message
    if lone:
      datamsgstr = "Processing CESM '{:s}'-file from Experiment '{:s}'".format(filetypes[0], dataset_name)
    else:
      datamsgstr = "Processing CESM dataset from Experiment '{:s}'".format(dataset_name)
    # figure out age of source file(s)
    srcage = getSourceAge(fileclasses=fileclasses, filetypes=filetypes, exp=exp, domain=None,
                          periodstr=periodstr, gridstr=gridstr, lclim=lclim, lts=lts)
    # load source data
    load3D = dataargs.pop('load3D',None) # if 3D fields should be loaded (default: False)
    if lclim:
      loadfct = partial(CESM.loadCESM, experiment=exp, name=None, grid=grid, period=period, varlist=varlist,
                        filetypes=filetypes, varatts=None, load3D=load3D, translateVars=None)
    elif lts:
      loadfct = partial(CESM.loadCESM_TS, experiment=exp, name=None, grid=grid, varlist=varlist,
                        filetypes=filetypes, varatts=None, load3D=load3D, translateVars=None)
  else:
    # assume observational datasets
    filetypes = [None] # only for CESM & WRF
    domain = None # only for WRF
    try:
      module = import_module('datasets.{0:s}'.format(dataset))
    except ImportError:
      raise DatasetError("Error loading dataset module '{:s}' from 'datasets' package!".format(dataset))
    dataset_name = module.dataset_name
    resolution = dataargs['resolution']
    if resolution: obs_res = '{0:s}_{1:s}'.format(dataset_name, resolution)
    else: obs_res = dataset_name
    # figure out period
    periodstr, gridstr = getPeriodGridString(period, grid, beginyear=1979)
    if period is None and lclim: periodstr = 'LTM'
    datamsgstr = "Processing Dataset '{:s}'".format(dataset_name)
    # assemble filename to check modification dates (should be only one file)
    filename = getFileName(grid=grid, period=period, name=obs_res, filetype=mode)
    avgfolder = module.avgfolder
    filepath = '{:s}/{:s}'.format(avgfolder,filename)
    # load pre-processed climatology
    kwargs = dict(name=dataset_name, grid=grid, varlist=varlist, resolution=resolution, varatts=None)
    if dataset == 'Unity': kwargs['unity_grid'] = dataargs['unity_grid']
    if lclim and module.loadClimatology is not None:
      loadfct = partial(module.loadClimatology, period=period, **kwargs)
    elif lts and module.loadTimeSeries is not None:
      loadfct = partial(module.loadTimeSeries, **kwargs)
    else:
      raise DatasetError("Unable to identify time aggregation mode; the dataset " +
                         "'{}' may not support selected mode '{}'.".format(dataset, mode))
    # check if the source file is actually correct
    if os.path.exists(filepath):
      filelist = [filepath]
    else:
      source = loadfct() # don't load dataset, just construct the file list
      filelist = source.filelist
    # figure out age of source file(s)
    srcage = getSourceAge(filelist=filelist, lclim=lclim, lts=lts)
    # N.B.: it would be nice to print a message, but then we would have to make the logger available,
    #       which would be too much trouble
  ## assemble and return meta data
  dataargs = namedTuple(dataset_name=dataset_name, period=period, periodstr=periodstr, avgfolder=avgfolder,
                        filetypes=filetypes, filetype=filetypes[0], domain=domain, obs_res=obs_res,
                        varlist=varlist, grid=grid, gridstr=gridstr, resolution=resolution)
  # return meta data
  return dataargs, loadfct, srcage, datamsgstr
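# Usage sketch (hypothetical dataset and arguments): getMetaData returns the meta-data
# namedtuple, a deferred load function, the source-file age, and a log message; downstream
# code typically pairs it with getTargetFile above:
#
#   dataargs, loadfct, srcage, datamsgstr = getMetaData('NARR', 'climatology',
#       dict(period=(1979,2009), grid=None, varlist=None, resolution=None))
#   source = loadfct() # actually load the source dataset
#   filename = getTargetFile(dataset='NARR', mode='climatology', dataargs=dataargs)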
  if isinstance(periods, (np.integer,int)): periods = [periods]
  # check and expand WRF experiment list
  WRF_experiments = getExperimentList(WRF_experiments, WRF_project, 'WRF')
  if isinstance(domains, (np.integer,int)): domains = [domains]
  # check and expand CESM experiment list
  CESM_experiments = getExperimentList(CESM_experiments, CESM_project, 'CESM')
  # expand datasets and resolutions
  if datasets is None: datasets = gridded_datasets
  if unity_grid is None and 'Unity' in datasets:
    if WRF_project: unity_grid = import_module('projects.{:s}'.format(WRF_project)).unity_grid
    else: raise DatasetError("Dataset 'Unity' has no native grid - please set 'unity_grid'.")
  # print an announcement
  if len(WRF_experiments) > 0:
    print('\n Regridding WRF Datasets:')
    print([exp.name for exp in WRF_experiments])
  if len(CESM_experiments) > 0:
    print('\n Regridding CESM Datasets:')
    print([exp.name for exp in CESM_experiments])
  if len(datasets) > 0:
    print('\n And Observational Datasets:')
    print(datasets)
  print('\n To Grid and Resolution:')
  for grid,reses in grids.iteritems():
    print('  {0:s} {1:s}'.format(grid, printList(reses) if reses else ''))
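# The 'grids' mapping consumed by the announcement loop above associates each target
# grid with the source resolutions to regrid from (None means native resolution only);
# a hypothetical example with made-up grid and resolution names:
#
#   grids = {'arb2_d02': None,       # regrid to 'arb2_d02' from the native resolution
#            'grw1': ('05','10')}    # regrid to 'grw1' from two source resolutions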