def interp_dataframe(data, levels=None, variables=None, min_count=5, dropna=True, report=None, verbose=0):
    """ Interpolate every profile of a DataFrame vertically to pressure levels

    1. Keep only profiles (rows sharing one index value) with enough data
    2. Interpolate each remaining profile to ``levels`` with interp_profile

    A profile is kept when more than two of the checked columns (``p``
    included) each hold strictly more than ``min_count`` non-missing values.

    Note: if the index of ``data`` has no name it is set to 'date' in place,
    i.e. the DataFrame passed by the caller is modified.

    Args:
        data        pandas DataFrame with a pressure column 'p'

    Keyword Args:
        levels      [None]  Output pressure levels, None uses raso.config.std_plevels
        variables   [None]  Columns used for the quality check, None uses all
                            columns; 'p' is always added
        min_count   [5]     Count threshold per column and profile (see above)
        dropna      [True]  Forwarded to interp_profile
        report      [None]  Forwarded to journal
        verbose     [0]     Forwarded to journal

    Returns:
        pandas DataFrame with interpolated values, indexed by the original
        index name and sorted by (index, p)

    Raises:
        ValueError      when 'p' or every other variable is missing from data
        RuntimeError    when no profile passes the minimum count check

    Calls:
        interp_profile
    """
    from raso.config import std_plevels

    funcid = "[INTP] "
    if levels is None:
        levels = std_plevels

    if variables is None:
        variables = data.columns.tolist()

    # The index name is needed further down to restore the index after the
    # groupby/apply round trip, so make sure there is one.
    if data.index.name is None or data.index.name == '':
        data.index.name = 'date'

    index_name = data.index.name

    variables = list(set(variables + ['p']))  # pressure is always required
    variables = data.columns[data.columns.isin(variables)].tolist()  # keep only existing columns
    # 'p' plus at least one more variable, as the error message states
    if len(variables) < 2 or 'p' not in variables:
        raise ValueError(funcid + "Dataframe requires at least 2 columns(p,+) %s" % (",".join(variables)))

    journal(funcid + "Quality check (%s) Min: %d NAN: %s" % (",".join(variables), min_count, color_boolean(dropna)),
            report, verbose)

    # Sometimes a variable has no valid value at all: drop it from the check
    counts = data.count()[variables]
    if (counts == 0).any():
        variables = counts[counts > 0].index.tolist()
        if len(variables) < 2 or 'p' not in variables:
            # single-argument print(...) behaves the same under Python 2 and 3
            print(counts)
            raise ValueError(funcid + "Dataframe requires at least 2 columns(p,+) %s" % (",".join(variables)))

    # Per profile: number of checked columns with more than min_count values
    per_profile = data.groupby(data.index).count()[variables]
    itx = (per_profile > min_count).sum(1) > 2
    if itx.sum() == 0:
        raise RuntimeError(funcid + "No data left with %d as minimum data count per profile" % min_count)

    # itx is indexed by unique profile keys; .loc aligns it to the (repeated)
    # row index of data. (.ix was removed from pandas.)
    data = data.loc[itx, :]

    journal(funcid + "from %d to %d" % (len(itx), itx.sum()), report, verbose)

    data = data.groupby(data.index).apply(interp_profile, variables=variables, pout=levels, oan=True,
                                          dropna=dropna, min_values=min_count)
    # The apply result carries a second, unnamed index level ('level_1' after
    # reset_index); drop it and restore the original index.
    data = data.reset_index().drop('level_1', axis=1).sort_values(by=[index_name, 'p']).set_index(index_name,
                                                                                                  drop=True)
    journal(funcid + "Done: %s" % str(data.shape), report, verbose)
    return data
def interp_profile_check(x, var='t', pout=None, pcolumn='p', dropna=True, min_values=4):
    """ Print input values next to interpolated values for one profile

    The input pressures are merged into the output levels, the variable is
    interpolated with interp_mod (in log pressure) and one line is printed per
    level, ordered by increasing pressure:
        - output only levels show '-' in the IN columns
        - input levels show the input value, the interpolated value and
          whether both are exactly equal

    Args:
        x           pandas DataFrame holding one profile (date)

    Keyword Args:
        var         ['t']   Variable to check interpolation
        pout        [None]  Output pressure levels, None uses raso.config.std_plevels
        pcolumn     ['p']   Name of pressure column
        dropna      [True]  Forwarded to interp_mod
        min_values  [4]     Forwarded to interp_mod

    Returns:
        None, the comparison is printed

    Raises:
        ValueError      if x is not a pandas DataFrame
        KeyError        if pcolumn not in x
        RuntimeError    if more than one index (date) is in x

    Calls:
        interp_mod
    """
    from raso.config import std_plevels

    if not isinstance(x, pd.DataFrame):
        raise ValueError("[INTP] Requires a DataFrame")

    if pcolumn not in x.columns:
        raise KeyError("[INTP] Pressure levels not in DataFrame")

    if len(x.index.unique()) > 1:
        raise RuntimeError("[INTP] Only one profile allowed here! ")

    if pout is None:
        pout = std_plevels

    pin = x[pcolumn].values  # input pressures
    order = np.argsort(pin)  # ascending pressure order
    pin = pin[order]
    pout = np.unique(np.sort(np.concatenate([pin, pout])))  # merge all levels, sort, unique
    values = x.drop(pcolumn, axis=1)[var].values[order]  # variable sorted like pin
    res = interp_mod(values, np.log(pout), np.log(pin), dropna=dropna, min_values=min_values)

    # single-argument print(...) behaves the same under Python 2 and 3
    for i, p in enumerate(pin):
        # Output-only levels located before this input level
        if i == 0:
            between = np.where(pout < p)[0]
        else:
            between = np.where(np.logical_and(pout > pin[i - 1], pout < p))[0]

        for no in between:
            print("IN: ( %9s %7s ) OUT: ( %9.2f %7.2f) " % ('-', '-', pout[no], res[no]))

        # Is the input level also an output level?
        v1 = np.where(pout == p)[0]
        if len(v1) == 0:
            # Happens e.g. for a NaN pressure, which never compares equal.
            # Report the sorted input value (the frame itself cannot be
            # indexed by position here).
            print("IN: ( %9.2f %7.2f ) OUT: ( %9s %7s) " % (p, values[i], '-', '-'))
        else:
            j = v1[0]  # pout is unique, so there is exactly one match; use a scalar index
            print("IN: ( %9.2f %7.2f ) OUT: ( %9.2f %7.2f) %s " % (
                p, values[i], pout[j], res[j], color_boolean(np.all((values[i] - res[j]) == 0.0))))

    # Output-only levels beyond the last input level
    for no in np.where(pout > pin[-1])[0]:
        print("IN: ( %9s %7s ) OUT: ( %9.2f %7.2f) " % ('-', '-', pout[no], res[no]))