def __init__(self, path_db, tair_var):
    '''
    Parameters
    ----------
    path_db : str
        File path to a serially complete netCDF station database
        containing the stations and temperature variable for
        interpolation.
    tair_var : str
        The temperature variable for interpolation ('tmin' or 'tmax')
    '''

    stn_da = StationSerialDataDb(path_db, tair_var, vcc_size=470560000 * 2)

    # Exclude stations flagged as bad; BAD is NaN for usable stations
    usable_mask = np.isnan(stn_da.stns[BAD])
    stn_slct = StationSelect(stn_da, stn_mask=usable_mask,
                             rm_zero_dist_stns=True)

    # Combined interpolator: kriging for normals + GWR for anomalies
    self.interp_tair = InterpTair(KrigTair(stn_slct), GwrTairAnom(stn_slct))
    self.stn_da = stn_da
    self.mth_masks = stn_da.mth_idx
def __init__(self, stn_da):
    '''
    Parameters
    ----------
    stn_da : twx.db.StationSerialDataDb
        A StationSerialDataDb object pointing to the database
        from which observations will be loaded.
    '''

    self.stn_da = stn_da

    # BAD is NaN for usable stations; mask keeps only those
    usable = np.isnan(self.stn_da.stns[BAD])
    self.stn_slct = StationSelect(self.stn_da, stn_mask=usable,
                                  rm_zero_dist_stns=True)

    mths = np.arange(1, 13)
    self.vnames_norm = [get_norm_varname(a_mth) for a_mth in mths]
    self.vnames_lst = [get_lst_varname(a_mth) for a_mth in mths]

    self.df_stns = pd.DataFrame(self.stn_da.stns)
    self.df_stns.index = self.df_stns[STN_ID]

    # Annual means across the 12 monthly LST and Tair normal columns
    self.df_stns['lst'] = self.df_stns[self.vnames_lst].mean(axis=1)
    self.df_stns['norm'] = self.df_stns[self.vnames_norm].mean(axis=1)
def __init__(self, path_db, tair_var):
    '''
    Parameters
    ----------
    path_db : str
        File path to a serially complete netCDF station database
        containing the stations and temperature variable for
        interpolation.
    tair_var : str
        The temperature variable for interpolation ('tmin' or 'tmax')
    '''

    stn_da = StationSerialDataDb(path_db, tair_var)

    # Keep only stations not flagged as bad (BAD is NaN when usable)
    stn_slct = StationSelect(stn_da,
                             stn_mask=np.isnan(stn_da.stns[BAD]),
                             rm_zero_dist_stns=True)

    self.krig = KrigTairAll(stn_slct)
    self.stn_da = stn_da
class XvalOutlier(object):
    '''
    Class for running a leave-one-out cross validation of simple
    geographically weighted regression models of station monthly and annual
    normals to determine if a station is an outlier and has possible
    erroneous values based on unrealistic model error.
    '''

    def __init__(self, stn_da):
        '''
        Parameters
        ----------
        stn_da : twx.db.StationSerialDataDb
            A StationSerialDataDb object pointing to the database
            from which observations will be loaded.
        '''

        self.stn_da = stn_da

        # BAD is NaN for usable stations; mask keeps only those
        mask_stns = np.isnan(self.stn_da.stns[BAD])
        self.stn_slct = StationSelect(self.stn_da, stn_mask=mask_stns,
                                      rm_zero_dist_stns=True)

        self.vnames_norm = [get_norm_varname(mth) for mth in np.arange(1, 13)]
        self.vnames_lst = [get_lst_varname(mth) for mth in np.arange(1, 13)]

        self.df_stns = pd.DataFrame(self.stn_da.stns)
        self.df_stns.index = self.df_stns[STN_ID]

        # Calculate annual means for monthly LST and Tair normals
        self.df_stns['lst'] = self.df_stns[self.vnames_lst].mean(axis=1)
        self.df_stns['norm'] = self.df_stns[self.vnames_norm].mean(axis=1)

    def run_xval_stn(self, stn_id, bw_nngh=100):
        '''
        Run a single leave-one-out cross validation of a geographically
        weighted regression model of a station's monthly and annual
        normals (norm~lst+elev+lon+lat).

        Parameters
        ----------
        stn_id : str
            The stn_id for which to run the cross validation
        bw_nngh : int, optional
            The number of neighbors to use for the
            geographically weighted regression. Default: 100.

        Returns
        ----------
        errs : ndarray
            A 13-element array of differences between predicted and
            observed (predicted minus observed): one error per monthly
            normal (indices 0-11) and one for the annual normal
            (index 12).
        '''

        xval_stn = self.stn_da.stns[self.stn_da.stn_idxs[stn_id]]
        df_xval_stn = self.df_stns.loc[stn_id, :]

        # Select neighbors around the target station, excluding the
        # station itself (leave-one-out)
        self.stn_slct.set_ngh_stns(xval_stn[LAT], xval_stn[LON], bw_nngh,
                                   load_obs=False, stns_rm=stn_id)
        df_nghs = self.df_stns.loc[self.stn_slct.ngh_stns[STN_ID], :]

        errs = np.empty(13)

        # Errors for monthly normals
        for mth in np.arange(1, 13):

            ls_form = 'norm%.2d~lst%.2d+elevation+longitude+latitude' % (mth, mth)
            # NOTE(review): assumes `sm` exposes the formula-based wls
            # (e.g. statsmodels.formula.api) -- confirm against the
            # file's imports
            ls_fit = sm.wls(ls_form, data=df_nghs,
                            weights=self.stn_slct.ngh_wgt).fit()
            err = ls_fit.predict(df_xval_stn)[0] - df_xval_stn['norm%.2d' % mth]
            errs[mth - 1] = err

        # Error for annual normal
        ls_form = 'norm~lst+elevation+longitude+latitude'
        ls_fit = sm.wls(ls_form, data=df_nghs,
                        weights=self.stn_slct.ngh_wgt).fit()
        err = ls_fit.predict(df_xval_stn)[0] - df_xval_stn['norm']
        errs[-1] = err

        return errs

    def find_xval_outliers(self, stn_ids=None, bw_nngh=100,
                           zscore_threshold=6):
        '''
        Runs a leave-one-out cross validation of a geographically weighted
        regression model of station monthly and annual normals
        (norm~lst+elev+lon+lat) and returns those stations whose error is a
        specified # of standard deviations above/below the mean.

        Parameters
        ----------
        stn_ids : list_like, optional
            The station ids for which to run the cross validation.
            If None, the cross validation will be run for all stations
            in the database.
        bw_nngh : int, optional
            The number of neighbors to use for the
            geographically weighted regression. Default: 100.
        zscore_threshold : float, optional
            The zscore threshold by which a station's error should be
            considered an outlier.

        Returns
        ----------
        out_stnids : ndarray
            The outlier stations.
        '''

        if stn_ids is None:
            stn_ids = self.stn_da.stn_ids
        else:
            # Accept any list-like input, as documented; the code below
            # relies on ndarray attributes (.size)
            stn_ids = np.asarray(stn_ids)

        schk = StatusCheck(stn_ids.size, check_cnt=250)

        xval_errs = np.zeros((13, stn_ids.size))

        for i, a_id in enumerate(stn_ids):
            xval_errs[:, i] = self.run_xval_stn(a_id, bw_nngh)
            schk.increment()

        xval_errs = pd.DataFrame(xval_errs)
        xval_errs.columns = stn_ids

        # |z-score| of each error across stations, computed per row
        # (12 monthly rows + 1 annual row)
        zscores = (xval_errs.subtract(xval_errs.mean(axis=1), axis=0)
                   .divide(xval_errs.std(axis=1), axis=0).abs())

        # A station is an outlier if any of its 13 errors exceeds the
        # threshold
        out_stnids = zscores.columns[(zscores > zscore_threshold).any(axis=0)].values

        return out_stnids
def __init__(self, stn_da_tmin, stn_da_tmax, aux_fpaths=None,
             interp_orders=None, norms_only=False):
    '''
    Parameters
    ----------
    stn_da_tmin : twx.db.StationSerialDataDb
        A StationSerialDataDb object pointing to the database
        from which Tmin observations should be loaded.
    stn_da_tmax : twx.db.StationSerialDataDb
        A StationSerialDataDb object pointing to the database
        from which Tmax observations should be loaded.
    aux_fpaths : list_like, optional
        File paths used to build a PredictorGrids object
        (presumably auxiliary predictor grids -- confirm with callers).
        If None, no PredictorGrids object is created.
    interp_orders : list_like, optional
        Passed to PredictorGrids alongside aux_fpaths.
    norms_only : bool, optional
        If True, use blank GWR anomaly models (GwrTairAnomBlank) so
        that only normals are interpolated. Default: False.
    '''

    self.days = stn_da_tmin.days
    self.stn_da_tmin = stn_da_tmin
    self.stn_da_tmax = stn_da_tmax

    # Masks for calculating monthly norms after daily Tmin/Tmax values
    # had to be adjusted due to Tmin >= Tmax
    in_norm_period = np.logical_and(self.days[YEAR] >= 1981,
                                    self.days[YEAR] <= 2010)
    self.daysNormMask = np.nonzero(in_norm_period)[0]

    days_norm = self.days[self.daysNormMask]
    u_yrs = np.unique(days_norm[YEAR])
    self.yr_mths = get_mth_metadata(u_yrs[0], u_yrs[-1])

    # One day-index mask per (year, month) pair within the normal period
    self.yrMthsMasks = [np.nonzero(np.logical_and(days_norm[YEAR] == a_yr,
                                                  days_norm[MONTH] == a_mth))[0]
                        for a_yr in u_yrs
                        for a_mth in np.arange(1, 13)]

    # One month-index mask per calendar month across all years
    self.mth_masks = [np.nonzero(self.yr_mths[MONTH] == a_mth)[0]
                      for a_mth in np.arange(1, 13)]

    # BAD is NaN for usable stations
    ok_tmin = np.isnan(stn_da_tmin.stns[BAD])
    ok_tmax = np.isnan(stn_da_tmax.stns[BAD])
    stn_slct_tmin = StationSelect(stn_da_tmin, ok_tmin)
    stn_slct_tmax = StationSelect(stn_da_tmax, ok_tmax)

    # Per-region neighbor parameters, restricted to usable stations
    # inside the interpolation domain (finite MASK)
    in_dom_tmin = np.logical_and(ok_tmin, np.isfinite(stn_da_tmin.stns[MASK]))
    in_dom_tmax = np.logical_and(ok_tmax, np.isfinite(stn_da_tmax.stns[MASK]))
    self.nnghparams_tmin = _get_rgn_nnghs_dict(stn_da_tmin.stns[in_dom_tmin])
    self.nnghparams_tmax = _get_rgn_nnghs_dict(stn_da_tmax.stns[in_dom_tmax])

    krig_tmin = KrigTair(stn_slct_tmin)
    krig_tmax = KrigTair(stn_slct_tmax)

    if norms_only:
        gwr_tmin = GwrTairAnomBlank(stn_slct_tmin)
        gwr_tmax = GwrTairAnomBlank(stn_slct_tmax)
    else:
        gwr_tmin = GwrTairAnom(stn_slct_tmin)
        gwr_tmax = GwrTairAnom(stn_slct_tmax)

    self.interp_tmin = InterpTair(krig_tmin, gwr_tmin)
    self.interp_tmax = InterpTair(krig_tmax, gwr_tmax)

    if aux_fpaths is not None:
        self.pGrids = PredictorGrids(aux_fpaths, interp_orders)

    self.a_pt = build_empty_pt()