def quantiles_at_breakpoint(data, var, dvar=None, quantilen=None, ibreak=None, sample_size=730, borders=180, verbose=0): """Calculate Quantiles at the breakpoints """ from departures import qmap_departure from support_functions import sample_indices, qstats funcid = '[QAB] ' if not isinstance(var, str): raise ValueError(funcid + "var Requires a string") if dvar is not None and not isinstance(dvar, str): raise ValueError(funcid + "dvar Requires a string") if dvar is None: dvar = var print funcid + "Data from Variable: ", dvar if not isinstance(data, (pd.DataFrame, pd.Panel)): raise ValueError("Require a DataFrame or Panel as input") if quantilen is None: quantilen = np.arange(0, 101, 10) quantilen = quantilen[(quantilen < 100) & (quantilen > 0)] # drop 0 and 100 qss = sample_size / (len(quantilen) + 1) / 2 # sample size per quantile print funcid + "Quantilen: ", quantilen print funcid + "Global Sample size: %d , per quantile(%d): %d" % (sample_size, len(quantilen), qss) mlabels = ["Q%d" % i for i in quantilen] mlabels.append(">") if isinstance(data, pd.DataFrame): if not data.columns.isin([var, '%s_breaks' % var]).sum() == 2: raise ValueError(funcid + "Variable not found: %s or %s_breaks in %s" % (var, var, str(data.columns))) # convert to panel if 'p' not in data.columns: out = {} # get Breakpoints int_breaks = np.where((data['%s_breaks' % var] > 0))[0] breaks = data.index[int_breaks] nb = len(breaks) if nb == 0: raise RuntimeError(funcid + "No Breakpoints found in %s and %s_breaks" % (var, var)) print "Found Breaks: ", nb print str(breaks) if (int_breaks[-1] + sample_size) > data.shape[0]: print funcid + "Reference data set is shorter than 1 year" for ib in reversed(range(nb)): if ibreak is not None and ibreak != ib: print funcid + "Looking for: ", breaks[ibreak], " at ", breaks[ib] continue # ibiased is everything between breakpoints # isample is minus the borders -> used to calculate ibiased, isample, iref = sample_indices(int_breaks, ib, data.index, sample_size=sample_size, borders=borders, recent=False, verbose=verbose - 1) # Quantiles at the breakpoint b1, c1, quants1 = qstats(data[dvar].values[iref], quantilen, qss) b2, c2, quants2 = qstats(data[dvar].values[isample], quantilen, qss) if verbose > 0: print funcid + " %s : %s " % (dvar, breaks[ib]) print funcid + " Qs(B): ", quants1 print funcid + " Qs(#): ", c1 print funcid + " Qs(B): ", quants2 print funcid + " Qs(#): ", c2 out[str(breaks[ib])] = pd.DataFrame({'Ref': quants1.tolist(), 'Bias': quants2.tolist()}, index=mlabels) return out # when there are pressure levels data = data.reset_index().set_index(['date', 'p']).to_panel() else: if not data.items.isin([var, '%s_breaks' % var]).sum() == 2: raise ValueError(funcid + "Variable not found: %s or %s_breaks in %s" % (var, var, str(data.items))) # per level # get Breakpoints int_breaks = np.where((data['%s_breaks' % var] > 0).any(1))[0] breaks = data.major_axis[int_breaks] nb = len(breaks) if nb == 0: raise RuntimeError(funcid + "No Breakpoints found in %s and %s_breaks" % (var, var)) print "Found Breaks: ", nb print str(breaks) if (int_breaks[-1] + sample_size) > data.shape[0]: print funcid + "Reference data set is shorter than 1 year" out = {} for ib in reversed(range(nb)): if ibreak is not None and ibreak != ib: print funcid + "Looking for: ", breaks[ibreak], " at ", breaks[ib] continue # ibiased is everything between breakpoints # isample is minus the borders -> used to calculate ibiased, isample, iref = sample_indices(int_breaks, ib, data.major_axis, sample_size=sample_size, borders=borders, recent=False, verbose=verbose - 1) # Quantiles at the breakpoint def myqstats(x, quantilen, sample_size): c, y = qstats(x, quantilen, sample_size) return y quants1 = np.apply_along_axis(myqstats, 0, data[dvar].values[iref], quantilen, qss) quants2 = np.apply_along_axis(myqstats, 0, data[dvar].values[isample], quantilen, qss) out[str(breaks[ib])] = pd.Panel({'Ref': quants1, 'Bias': quants2}, major_axis=mlabels, minor_axis=data.minor_axis) return out
def quantile_era_correction(data, var, rvar, bvar, quantilen=None, sample_size=730, borders=None, bounded=None, database=False, verbose=0): from departures import qmap_era_departure from support_functions import sample_indices funcid = '[CQ] ' if isinstance(var, str): var = [var] # as list if isinstance(bvar, str): bvar = [bvar] * len(var) # as list if isinstance(rvar, str): rvar = [rvar] * len(var) # as list if quantilen is None: quantilen = np.arange(0, 101, 10) pressure_levels = True if isinstance(data, pd.DataFrame): if 'p' in data.columns: # 2D print funcid + " database detected" for ivar, jvar in zip(var, bvar): if not data.columns.isin([ivar, jvar]).sum() == 2: raise ValueError(funcid + "Variable not found: %s" % (str(data.columns))) data = data.reset_index().set_index(['date', 'p']).to_panel() else: # 1D pressure_levels = False elif isinstance(data, pd.Panel): for ivar, jvar in zip(var, bvar): if not data.items.isin([ivar, jvar]).sum() == 2: raise ValueError(funcid + "Variable not found: %s" % (str(data.items))) else: raise ValueError("Require a DataFrame or Panel as input") if pressure_levels: data.major_axis.name = 'date' dates = data.major_axis # Druckflächen plevels = data.minor_axis.values if verbose > 0: print funcid + "p-Levels: ", ",".join(["%d" % (ip / 100) for ip in plevels]), ' hPa' else: dates = data.index if bounded is None: ubound = None lbound = None else: lbound, ubound = bounded # What else ? qss = sample_size / len(quantilen) / 2 # if verbose > 0: print funcid + "Adjusting sample_size from %d to %d " % (sample_size, qss) print funcid + "Quantiles: %d" % len(quantilen) breakpoint_stat = {} for ivar, zvar, ibvar in zip(var, rvar, bvar): # BREAKS if pressure_levels: int_breaks = np.where((data[ibvar] > 0).any(1))[0] else: int_breaks = np.where((data[ibvar] > 0))[0] breaks = dates[int_breaks] if (int_breaks[-1] + sample_size) > dates.shape[0]: print funcid + "Reference data set is shorter than 1 year" # copy data["%s_qecor" % ivar] = data[ivar].copy() xdata = data["%s_qecor" % ivar].values # Numpy Array (time x p-levels) nb = len(breaks) # Correct zvar to fit var in Reference Period and use # zvar to quantile match the rest of the timeseries data['%s_adj' % zvar] = data[zvar].copy() ydata = data["%s_adj" % zvar].values # Numpy Array (time x p-levels) # jvar = data.items.get_loc("%s_adj" % zvar) if pressure_levels: for i in range(xdata.shape[1]): # data[ivar].values[:, i], # data[zvar].values[:, i], q_dep = qmap_era_departure(xdata[:, i], ydata[:, i], slice(int_breaks[-1], None), quantilen, qss) if bounded is not None: tmp_qad = ydata[:, i] + q_dep # data.values[jvar, :, i] + q_dep q_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, q_dep) # data.iloc[jvar, :, i] += q_dep ydata[:, i] += q_dep else: # data[ivar].values, # data[zvar].values, q_dep = qmap_era_departure(xdata, ydata, slice(int_breaks[-1], None), quantilen, qss) if bounded is not None: tmp_qad = ydata + q_dep # data.iloc[:, jvar] + q_dep q_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, q_dep) # data.iloc[:, jvar] += q_dep ydata[:, i] += q_dep # Breakpoint Loop for ib in reversed(range(nb)): # ibiased is everything between breakpoints # isample is minus the borders -> used to calculate isample, ibiased, iref = sample_indices(int_breaks, ib, dates, sample_size=sample_size, borders=borders, recent=False, verbose=verbose - 1) # if pressure_levels: # jvar = data.items.get_loc("%s_qecor" % ivar) for i in range(xdata.shape[1]): # data["%s_adj" % zvar].values[ibiased, i], # data[ivar].values[ibiased, i], q_dep = qmap_era_departure(ydata[ibiased, i], xdata[ibiased, i], slice(None, None), quantilen, qss) if bounded is not None: tmp_qad = xdata[ibiased, i] + q_dep # data.values[jvar, ibiased, i] + q_dep q_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, q_dep) # data.values[jvar, ibiased, i] = (data.iloc[jvar, ibiased, i].values + q_dep)#[:, np.newaxis] # data.iloc[jvar, ibiased, i] += q_dep #[:, np.newaxis] # array (time) xdata[ibiased, i] += q_dep else: # jvar = data.columns.get_loc("%s_qecor" % ivar) # data["%s_adj" % zvar].values[ibiased], # data[ivar].values[ibiased], q_dep = qmap_era_departure(ydata[ibiased], xdata[ibiased], slice(None, None), quantilen, qss) if bounded is not None: tmp_qad = xdata[ibiased] + q_dep # data.values[ibiased, jvar] + q_dep q_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, q_dep) # data.iloc[ibiased, jvar] += q_dep # array ( time ) xdata[ibiased] += q_dep # nsample = data["%s_qecor" % ivar][isample].count().values # nref = data["%s_qecor" % ivar][iref].count().values nsample = np.isfinite(xdata[isample]).sum() nref = np.isfinite(xdata[iref]).sum() breakpoint_stat[str(breaks[ib])] = {'i': int_breaks[ib], 'isample': isample, 'ibiased': ibiased, 'iref': iref, 'nref': nref, 'nsamp': nsample} # if verbose > 0: # print funcid + " %s : %s 50%%: %9f (L%02d)" % ( # ivar, breaks[ib], np.nanmedian(q_dep), np.sum(np.any(q_dep != 0, axis=0))) data["%s_qecor" % ivar] = xdata if database: return data.to_frame(filter_observations=False).reset_index().set_index('date', drop=True) return breakpoint_stat, data
def surrogate_quantile_correction(data, var, dvar, quantilen=None, sample_size=730, borders=180, database=False, func='nanmean', verbose=0): from departures import qmap_var_departure from support_functions import sample_indices funcid = '[CS] ' if not isinstance(var, str) and not isinstance(dvar, str): raise ValueError("Only one variable allowed. As String") if quantilen is None: quantilen = np.arange(0, 100, 10) pressure_levels = True if isinstance(data, pd.DataFrame): if 'p' in data.columns: # 2D print funcid + " database detected" if var not in data.columns: raise ValueError(funcid + "Variable not found: %s in %s" % (var, str(data.columns))) if '%s_breaks' % var not in data.columns: raise ValueError(funcid + "Variable not found: %s_breaks in %s" % (var, str(data.columns))) if dvar not in data.columns: raise ValueError(funcid + "Variable not found: %s in %s" % (dvar, str(data.columns))) data = data.reset_index().set_index(['date', 'p']).to_panel() else: # 1D pressure_levels = False elif isinstance(data, pd.Panel): if var not in data.items: raise ValueError(funcid + "Variable not found: %s in %s" % (var, str(data.items))) if dvar not in data.items: raise ValueError(funcid + "Variable not found: %s in %s" % (dvar, str(data.items))) if '%s_breaks' % var not in data.items: raise ValueError(funcid + "Variable not found: %s_breaks in %s" % (var, str(data.items))) else: raise ValueError("Require a DataFrame or Panel as input") # always 2 variables -> dataframe at least if pressure_levels: data.major_axis.name = 'date' dates = data.major_axis # Druckflächen plevels = data.minor_axis.values if verbose > 0: print funcid + "p-Levels: ", ",".join(["%d" % (ip / 100) for ip in plevels]), ' hPa' else: dates = data.index # What else ? sample_size /= len(quantilen) # BREAKS if pressure_levels: int_breaks = np.where((data['%s_breaks' % var] > 0).any(1))[0] else: int_breaks = np.where((data['%s_breaks' % var] > 0))[0] breaks = dates[int_breaks] if (int_breaks[-1] + sample_size) > dates.shape[0]: print funcid + "Reference data set is shorter than 1 year" # copy data["%s_cor_%s" % (var, dvar)] = data[var] nb = len(breaks) for ib in reversed(range(nb)): if verbose > 0: print funcid + "Break: " + str(breaks[ib]) ibiased, iref = sample_indices(int_breaks, ib, dates, sample_size=sample_size, borders=180, recent=False, verbose=verbose) if pressure_levels: ## Mittler Unterschied pro Quantile von einer anderen abhängigen variablen q_dep = np.empty_like(data[var].values[ibiased, :]) for i in range(data.shape[2]): q_dep[:, i] = qmap_var_departure(data["%s_cor_%s" % (var, dvar)].values[:, i], data[dvar].values[:, i], iref, ibiased, ibiased, quantilen, sample_size, verbose=verbose) data["%s_cor_%s" % (var, dvar)].values[ibiased, :] += q_dep # one value per level else: q_dep = qmap_var_departure(data["%s_cor_%s" % (var, dvar)].values, data[dvar].values, iref, ibiased, ibiased, quantilen, sample_size, verbose=verbose, func=func) data["%s_cor_%s" % (var, dvar)].values[ibiased] += q_dep if verbose > 0: print funcid + " %s : %s 50%%: %f" % (var, breaks[ib], np.nanmedian(q_dep)) if database: return data.to_frame(filter_observations=False).reset_index().set_index('date', drop=True) return data
def mean_correction(data, var, breakvar, sample_size=730, borders=180, database=False, bounded=None, varcopy=True, verbose=0): """ Mean Correction of breakpoints Parameters ---------- data var breakvar sample_size borders database bounded varcopy verbose Returns ------- stat, data """ from departures import mean_departure from support_functions import sample_indices funcid = '[CM] ' if isinstance(var, str): var = [var] # as list if isinstance(breakvar, str): breakvar = [breakvar] * len(var) # as list if bounded is None: ubound = None lbound = None else: lbound, ubound = bounded pressure_levels = True if isinstance(data, pd.DataFrame): if 'p' in data.columns: # 2D print funcid + " database detected > conversion to Panel" for ivar, jvar in zip(var, breakvar): if not data.columns.isin([ivar, jvar]).sum() == 2: raise ValueError(funcid + "Variable not found: %s in %s" % (ivar, str(data.columns))) data.index.name = 'date' data = data.reset_index().set_index(['date', 'p']).to_panel() else: # only 1D pressure_levels = False elif isinstance(data, pd.Panel): for ivar, jvar in zip(var, breakvar): if not data.items.isin([ivar, jvar]).sum() == 2: raise ValueError(funcid + "Variable not found: %s in %s" % (ivar, str(data.items))) else: raise ValueError("Require a DataFrame or Panel as input") if pressure_levels: data.major_axis.name = 'date' dates = data.major_axis # Druckflächen plevels = data.minor_axis.values if verbose > 0: print funcid + "p-Levels: ", ",".join(["%d" % (ip / 100) for ip in plevels]), ' hPa' else: dates = data.index for ivar, ibvar in zip(var, breakvar): # BREAKS if pressure_levels: int_breaks = np.where((data[ibvar] > 0).any(1))[0] # breakpoint in all levels else: int_breaks = np.where((data[ibvar] > 0))[0] breaks = dates[int_breaks] if (int_breaks[-1] + sample_size) > dates.shape[0]: print funcid + "Reference data set is shorter than 1 year" # Copy or use existing if not hasnames(data, '%s_mcor' % ivar) or varcopy: data["%s_mcor" % ivar] = data[ivar].copy() # Make a copy nb = len(breaks) if verbose > 0: print funcid + " %s Found %d breakpoints" % (ivar, nb) breakpoint_stat = {} xdata = data["%s_mcor" % ivar].values # Numpy Array (time x p-levels) for ib in reversed(range(nb)): # ibiased is everything between breakpoints # isample is minus the borders -> used to calculate isample, ibiased, iref = sample_indices(int_breaks, ib, dates, sample_size=sample_size, borders=borders, recent=False, verbose=verbose - 1) if pressure_levels: # jvar = data.items.get_loc("%s_mcor" % ivar) # index of variable # data["%s_mcor" % ivar].values, m_dep = np.apply_along_axis(mean_departure, 0, xdata, iref, isample, sample_size) # setting with ndarray requires precise shape conditions if bounded is not None: tmp_qad = xdata[ibiased, :] + m_dep # data.iloc[jvar, ibiased, :] + m_dep m_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, m_dep) xdata[ibiased, :] += m_dep # has now the right shape else: # data.iloc[jvar, ibiased, :] = (data.iloc[jvar, ibiased, :].values + m_dep)[np.newaxis, ::] xdata[ibiased, :] += m_dep[np.newaxis, ::] # one value per level, this can cause negative DPD values else: # jvar = data.columns.get_loc("%s_mcor" % ivar) # data["%s_mcor" % ivar].values, m_dep = mean_departure(xdata, iref, isample, sample_size) if bounded is not None: tmp_qad = xdata[ibiased] + m_dep # data.iloc[ibiased, jvar] + m_dep m_dep = np.where((tmp_qad < lbound) | (tmp_qad > ubound), 0, m_dep) # data.iloc[ibiased, jvar] += m_dep # one value per time xdata[ibiased] += m_dep # nsample = data["%s_mcor" % ivar][isample].count() # nref = data["%s_mcor" % ivar][iref].count() nsample = np.isfinite(xdata[isample]).sum() nref = np.isfinite(xdata[iref]).sum() breakpoint_stat[str(breaks[ib])] = {'i': int_breaks[ib], 'isample': isample, 'ibiased': ibiased, 'iref': iref, 'mcor': m_dep, 'nref': nref, 'nsamp': nsample} if verbose > 0: print funcid + " %s : %s 50%%: %9f " % (ivar, breaks[ib], np.nanmedian(m_dep)) data["%s_mcor" % ivar] = xdata # fill in if database: return data.to_frame(filter_observations=False).reset_index().set_index('date', drop=True) return breakpoint_stat, data