def merge(data, attach, standard_times=True, suffix='era', report=None, verbose=0):
    """Merge data and attach into one DataFrame

    Matching criteria are date and p.

    Parameters
    ----------
    data : DataFrame
        Original data
    attach : DataFrame
        Data to be merged with the original
    standard_times : bool
        Consider only 0 and 12 UTC for merging
    suffix : str
        Suffix for the attached columns
    verbose : int
        Verbosity

    Returns
    -------
    DataFrame with merged columns
    """
    if 'p' not in data.columns or 'p' not in attach.columns:
        raise ValueError("Requires p in both DataFrames")

    # Match index names
    data.index.name = 'date'
    attach.index.name = 'date'

    if standard_times:
        # Convert sonde data to standard times to match ERA-Interim data better
        data = standard_dates_times(data, report=report, verbose=verbose)
        journal('[MERGE] Data with standard times: %s' % str(data.shape), report, verbose)

    if not suffix.startswith('_'):
        suffix = '_' + suffix

    return pd.merge(data.reset_index(),
                    attach.rename(columns=lambda x: x + suffix).reset_index(),
                    left_on=['date', 'p'], right_on=['date', 'p' + suffix],
                    how='left').drop(columns=['p' + suffix]).set_index('date', drop=True)
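
# Usage sketch (illustrative, not part of the processing chain): merge a sonde
# frame with an ERA-style frame on date and pressure. The synthetic frames
# below are assumptions for demonstration only.
def _example_merge():
    import pandas as pd
    dates = pd.to_datetime(['2000-01-01 00:00', '2000-01-01 12:00'])
    sonde = pd.DataFrame({'p': [85000., 85000.], 't': [270.1, 271.3]}, index=dates)
    era = pd.DataFrame({'p': [85000., 85000.], 't': [270.0, 271.0]}, index=dates)
    # standard_times=False because the index is already at 00/12 UTC;
    # the result has columns p, t, t_era, indexed by date
    return merge(sonde, era, standard_times=False, suffix='era')
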
def standard_data(data, interpolate=True, int_vars=None, method='murphy_koop',
                  levels=None, init_replace=True, replace=False, daynight=True,
                  report=None, verbose=0, **kwargs):
    """Standardize radiosonde data

    1. Quality control
    2. Datetime fix (0/12 soundings)
    3. Interpolation to standard levels (ERA levels)
    4. Variable conversion (td, vp, dpd)
    5. Final quality control

    Parameters
    ----------
    data : pd.DataFrame
        Profile data in database format
    interpolate : bool
        Interpolate to levels (default: era_plevels)
    method : str
        Saturation water vapor pressure formulation
    levels : list
        Pressure levels for interpolation
    init_replace : bool
        Replace flagged values during the first quality check?
    replace : bool
        Replace flagged values with NaN
    daynight : bool
        Use only 00 and 12 UTC
    verbose : int
        Verbosity
    kwargs : dict

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        Input is not a DataFrame
    """
    from raso.config import era_plevels
    from qc import control
    from dpd_anomaly import dpd_anomaly
    from interpolation import interp_dataframe
    from standard_dates_times import standard_dates_times

    funcid = "[STD] Data "
    if not isinstance(data, pd.DataFrame):
        raise ValueError(funcid + "Requires a DataFrame in database style")

    start = set_starttime()
    ############################################################################
    #
    # Create a private copy and modify it
    #
    data = data.copy()  # active copy
    ############################################################################
    #
    # QC (replace flagged values with NaN)
    #
    data = control(data, replace=init_replace, report=report, verbose=verbose - 1)
    #
    # DPD 30 detection
    #
    if 'dpd' in data.columns:  # for IGRA?
        data = dpd_anomaly(data, var='dpd', num_years=10, before_year='1994',
                           replace=replace, report=report, verbose=verbose - 1)
    #
    # Standard dates and times, remove duplicates!
    #
    standard_dates_times(data, inplace=True, report=report, verbose=verbose - 1)
    journal(funcid + 'Standard Dates-Times (%s)' % str(data.shape), report, verbose)

    if daynight:
        itime = data.index.hour * 100 + data.index.minute  # including minutes
        itx = (itime == 0) | (itime == 1200)
        data = data.loc[itx, :].copy()  # limit to 00 and 12 UTC
        journal(funcid + "Data [0, 12]: %s" % str(data.shape), report, verbose)
    ############################################################################
    #
    # Convert humidity variables to relative humidity for interpolation
    # See manual
    #
    standard_rel_humidity(data, inplace=True, replace=replace, method=method,
                          report=report, verbose=verbose - 1)
    ############################################################################
    #
    # Standard levels / interpolation
    #
    take_era = False
    if levels is None:
        levels = era_plevels  # a lot of levels
        take_era = True

    if interpolate:
        if int_vars is None:
            int_vars = ['p', 't', 'r']
        data = interp_dataframe(data, variables=int_vars, levels=levels,
                                report=report, verbose=verbose - 1)
        data = data.query('orig>0').copy()  # keep only standard levels
    else:
        cmd = "|".join(["p==%d" % ip for ip in levels])
        data = data.query(cmd)

    journal(funcid + 'Level selection (%d), ERA (%s)' % (len(levels), take_era), report, verbose)
    ############################################################################
    #
    # Final quality control
    #
    data = control(data, replace=replace, report=report, verbose=verbose - 1)
    ############################################################################
    #
    # Convert RH to vp and dpd
    #
    standard_water_vapor(data, inplace=True, replace=replace, method=method,
                         report=report, verbose=verbose - 1)
    standard_dewpoint_depression(data, inplace=True, replace=replace, method=method,
                                 report=report, verbose=verbose - 1)
    #
    # DPD 30 detection
    #
    if 'dpd' in data.columns:
        data = dpd_anomaly(data, var='dpd', num_years=10, before_year='1994',
                           replace=replace, report=report, verbose=verbose - 1)
    ############################################################################
    if verbose > 0:
        print_time(start)
    return data
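
# Usage sketch (assumes the raso package and its qc/interpolation modules are
# importable). 'profiles' is a hypothetical database-style DataFrame with a
# datetime index and columns p, t plus a humidity variable (r, q or dpd).
def _example_standard_data(profiles):
    # levels=None selects the default ERA pressure levels (era_plevels)
    return standard_data(profiles, interpolate=True, levels=None,
                         init_replace=True, replace=True, daynight=True, verbose=1)
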
def standard_dewpoint_depression(data, tvar='t', dpdvar='dpd', vpvar='vp', tdvar='td',
                                 update=False, replace=False, method='murphy_koop',
                                 inplace=False, report=None, verbose=0, **kwargs):
    """Convert water vapor pressure to dewpoint depression

    Parameters
    ----------
    data : pd.DataFrame
        Input radiosonde data
    tvar : str
        Temperature variable to use: t, t_cor
    dpdvar : str
        Dewpoint depression variable to use: dpd, dpd_mcor
    vpvar : str
        Water vapor pressure variable: vp
    tdvar : str
        Dewpoint variable: td
    method : str
        Saturation water vapor formulation
    update : bool
        Update existing dpd
    replace : bool
        Set flagged values to NaN
    inplace : bool
        Apply directly to the input (no copy)?
    verbose : int
        Verbosity
    kwargs : dict

    Returns
    -------
    pd.DataFrame, same as input (adds columns)
    """
    from raso.met.conversion import dewpoint

    dpd_absmax = 60.
    ############################################################################
    funcid = "[SV] "
    if not isinstance(data, (pd.DataFrame, pd.Panel)):
        raise ValueError(funcid + "Requires a DataFrame or Panel")

    if not inplace:
        data = data.copy()

    if not hasnames(data, 'qual'):
        data['qual'] = ''

    if not hasnames(data, vpvar):
        raise ValueError(funcid + "Requires a humidity variable: vp")

    if not hasnames(data, tvar):
        raise ValueError(funcid + "Requires variable: t")
    ############################################################################
    #
    # Dewpoint / mostly for IGRA
    #
    if not hasnames(data, dpdvar) or update:
        dpd = data[tvar] - dewpoint(data[vpvar], method=method)
        if hasnames(data, dpdvar):
            logic = (np.isfinite(dpd) & ~np.isfinite(data[dpdvar].values))  # update? GOOD, BAD
            data[dpdvar] = np.where(logic, dpd, data[dpdvar].values)  # update? NEW, OLD
            data[tdvar] = data[tvar].values - np.where(logic, dpd, data[dpdvar].values)  # update? NEW, OLD
        else:
            data[dpdvar] = dpd
            data[tdvar] = data[tvar].values - dpd
        journal(funcid + "DPD (%s, %s) from vp (%s), replace: %s" % (dpdvar, tvar, vpvar, replace),
                report, verbose)
    #
    # DPD valid range?
    #
    logic = ((data[dpdvar].values < 0) | (data[dpdvar].values > dpd_absmax))  # BAD, GOOD
    if replace:
        data[dpdvar] = np.where(logic, np.nan, data[dpdvar].values)  # set dpd
        data[vpvar] = np.where(logic, np.nan, data[vpvar].values)    # set vp
        data[tdvar] = np.where(logic, np.nan, data[tdvar].values)    # set td
    data['qual'] = np.where(logic, data.qual.values + 'D', data.qual.replace('D', '').values)  # FLAG: D
    #
    # Dewpoint larger than temperature!
    #
    logic = (data[tdvar].values > data[tvar].values)  # BAD, GOOD
    if replace:
        data[dpdvar] = np.where(logic, np.nan, data[dpdvar].values)
        data[tdvar] = np.where(logic, np.nan, data[tdvar].values)
        data[vpvar] = np.where(logic, np.nan, data[vpvar].values)
    data['qual'] = np.where(logic, data.qual.values + 'Y', data.qual.replace('Y', '').values)  # FLAG: Y
    ############################################################################
    #
    # Unique flags
    #
    data['qual'] = unique_flags(data['qual'])
    if not inplace:
        return data
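
# Usage sketch (hedged): derive dpd and td from vp and t. 'data' is assumed to
# hold t [K] and vp [Pa]; with replace=True, out-of-range values (flag D) and
# dewpoints above temperature (flag Y) are set to NaN.
def _example_dewpoint_depression(data):
    return standard_dewpoint_depression(data, update=False, replace=True, verbose=1)
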
def standard_water_vapor(data, tvar='t', dpdvar='dpd', vpvar='vp', rvar='r', qvar='q',
                         update=False, replace=False, method='murphy_koop',
                         inplace=False, report=None, verbose=0, **kwargs):
    """Convert humidity variables to water vapor pressure

    Priority: 1. RH (esat), 2. Q (p), 3. DPD (esat) --> vp
    Limits are taken from the RTTOV coefficient file (54-level regression limits).

    Notes
    -----
    http://nwpsaf.eu/oldsite/deliverables/rtm/rttov11_coefficients.html#54L_reg_limits

    Parameters
    ----------
    data : DataFrame
        Input database
    tvar, dpdvar, vpvar, rvar, qvar : str
        Variable names
    update : bool
        Update existing vp
    replace : bool
        Set flagged values to NaN
    method : str
        Saturation water vapor formulation
    inplace : bool
    verbose : int
    kwargs : dict

    Returns
    -------
    data
    """
    from raso.met.conversion import sh2vap
    from raso.met import esat_functions
    from raso.qc import profile_limits
    ############################################################################
    rt = profile_limits(tohpa=True, simple_names=True)  # RTTOV variable limits
    rt['p'] *= 100.      # hPa to Pa
    rt['vpmin'] *= 100.  # hPa to Pa
    rt['vpmax'] *= 100.  # hPa to Pa
    ############################################################################
    funcid = "[SV] "
    if not isinstance(data, (pd.DataFrame, pd.Panel)):
        raise ValueError(funcid + "Requires a DataFrame or Panel")

    if not inplace:
        data = data.copy()

    if not hasnames(data, 'qual'):
        data['qual'] = ''

    if hasnames(data, [rvar, qvar, dpdvar], value=0):
        raise ValueError(funcid + "Requires a humidity variable: r, q or dpd")

    if hasnames(data, tvar, value=0):
        raise ValueError(funcid + "Requires variable: t")

    vpfunc = getattr(esat_functions, method)
    remove_pressure = False
    if isinstance(data, pd.Panel):
        ndates = data.shape[1] * data.shape[2]
    else:
        ndates = data.shape[0]

    if not hasnames(data, vpvar) or update:
        if isinstance(data, pd.Panel):
            data['p'] = 0.  # add minor_axis as p
            data.loc['p', :, :] = np.asarray(data.minor_axis)[np.newaxis, np.newaxis, :]
            remove_pressure = True
        ########################################################################
        #
        # First use R (which includes Q)
        #
        if hasnames(data, rvar):
            journal(funcid + "Using r (%s) and t (%s) for vp (%s)" % (rvar, tvar, vpvar),
                    report, verbose)
            data[vpvar] = data[rvar].values * vpfunc(data[tvar].values)  # convert r, t to vp
        ########################################################################
        #
        # Q
        #
        elif hasnames(data, qvar):
            journal(funcid + "Using q (%s) and p to fill gaps for vp (%s)" % (qvar, vpvar),
                    report, verbose)
            vp = sh2vap(data[qvar].values, data['p'].values)  # convert q, p to vp
            if hasnames(data, vpvar):
                logic = (np.isfinite(vp) & ~np.isfinite(data[vpvar].values))  # update? GOOD, BAD
                data[vpvar] = np.where(logic, vp, data[vpvar].values)  # update? NEW, OLD
            else:
                data[vpvar] = vp
        ########################################################################
        #
        # Second use DPD (IGRA)
        #
        elif hasnames(data, dpdvar):
            journal(funcid + "Warning: using dpd (%s) for vp (%s)" % (dpdvar, vpvar),
                    report, verbose)
            vp = vpfunc((data[tvar] - data[dpdvar]).values)  # convert Td to vp
            if hasnames(data, vpvar):
                logic = (np.isfinite(vp) & ~np.isfinite(data[vpvar].values))  # update? GOOD, BAD
                data[vpvar] = np.where(logic, vp, data[vpvar].values)  # update? NEW, OLD
            else:
                data[vpvar] = vp
        else:
            raise RuntimeError("No humidity variable found!")
    ############################################################################
    #
    # Quality control
    #
    vpmins = np.interp(np.log(data.p.values), np.log(rt.p.values), rt.vpmin.values,
                       left=rt.vpmin.min(), right=rt.vpmin.max())  # interpolate minimum
    vpmaxs = np.interp(np.log(data.p.values), np.log(rt.p.values), rt.vpmax.values,
                       left=rt.vpmax.min(), right=rt.vpmax.max())  # interpolate maximum
    # Range? BAD, GOOD
    logic = ((data[vpvar].values < vpmins) | (data[vpvar].values > vpmaxs)) & np.isfinite(data[vpvar].values)
    data['qual'] = np.where(logic, data.qual.values + 'V', data.qual.replace('V', '').values)  # FLAG: V
    journal(funcid + "#%8d V flagged. (%d)" % (np.sum(np.sum(flag_inside(data.qual, 'V'))), ndates),
            report, verbose)
    if replace:
        data[vpvar] = np.where(logic, np.nan, data[vpvar].values)  # apply? BAD, GOOD
    ############################################################################
    #
    # Unique flags
    #
    data['qual'] = unique_flags(data['qual'])
    ############################################################################
    #
    # Pressure
    #
    if remove_pressure:
        del data['p']

    if not inplace:
        return data
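
# Usage sketch (hedged): compute vp from the best available humidity variable,
# trying r first, then q, then dpd, and flag values outside the interpolated
# RTTOV limits with V. 'data' is assumed to be in database format with t and p.
def _example_water_vapor(data):
    return standard_water_vapor(data, update=False, replace=False, verbose=1)
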
def standard_rel_humidity(data, rvar='r', tvar='t', dpdvar='dpd', qvar='q',
                          update=False, replace=False, method='murphy_koop',
                          inplace=False, report=None, verbose=0):
    """Convert humidity variables to relative humidity

    1. q to vp
    2. dpd to vp
    3. merge vp
    4. vp to rh
    Sets quality flags.

    Parameters
    ----------
    data : DataFrame
        Input database
    rvar, tvar, dpdvar, qvar : str
        Variable names
    update : bool
        Update existing r
    replace : bool
        Set flagged values to NaN
    method : str
        Saturation water vapor formulation
    inplace : bool
    verbose : int

    Returns
    -------
    data
    """
    from raso.met.conversion import sh2vap
    from raso.met import esat_functions
    from raso.qc import profile_limits
    ############################################################################
    rt = profile_limits(tohpa=True, simple_names=True)  # RTTOV variable limits
    rt['p'] *= 100.      # hPa to Pa
    rt['vpmin'] *= 100.  # hPa to Pa
    rt['vpmax'] *= 100.  # hPa to Pa
    ############################################################################
    funcid = "[SV] "
    r_absmin = 0
    r_absmax = 1
    if not isinstance(data, (pd.DataFrame, pd.Panel)):
        raise ValueError(funcid + "Requires a DataFrame or Panel")

    if not inplace:
        data = data.copy()

    if not hasnames(data, 'qual'):
        data['qual'] = ''

    if hasnames(data, [rvar, qvar, dpdvar], value=0):
        raise ValueError(funcid + "Requires a humidity variable: %s, %s or %s" % (rvar, qvar, dpdvar))

    if hasnames(data, tvar, value=0):
        raise ValueError(funcid + "Requires variable: %s" % tvar)

    vpfunc = getattr(esat_functions, method)
    remove_pressure = False
    if not hasnames(data, rvar) or update:
        if isinstance(data, pd.Panel):
            data['p'] = 0.  # add minor_axis as p
            data.loc['p', :, :] = np.asarray(data.minor_axis)[np.newaxis, np.newaxis, :]
            remove_pressure = True

        journal(funcid + "rel. humidity (%s) update: %s" % (rvar, update), report, verbose)
        # Quality control limits
        vpmins = np.interp(np.log(data.p.values), np.log(rt.p.values), rt.vpmin.values,
                           left=rt.vpmin.min(), right=rt.vpmin.max())  # interpolate minimum
        vpmaxs = np.interp(np.log(data.p.values), np.log(rt.p.values), rt.vpmax.values,
                           left=rt.vpmax.min(), right=rt.vpmax.max())  # interpolate maximum
        vpsat = vpfunc(data[tvar].values)

        if hasnames(data, rvar):
            vp = data[rvar].values * vpsat  # convert r, t to vp
            logic = ((vp < vpmins) | (vp > vpmaxs)) & np.isfinite(vp)
            data['qual'] = np.where(logic, data.qual.values + 'R', data.qual.replace('R', '').values)  # FLAG: R
            journal(funcid + "rel. humidity (%s) available (replace: %s)" % (rvar, replace),
                    report, verbose)
            if replace:
                vp = np.where(logic, np.nan, vp)  # apply? BAD, GOOD
        else:
            vp = np.full(data[tvar].shape, np.nan)

        if hasnames(data, qvar):
            qvp = sh2vap(data[qvar].values, data['p'].values)  # exact formula, no approximation
            logic = ((qvp < vpmins) | (qvp > vpmaxs)) & np.isfinite(qvp)
            data['qual'] = np.where(logic, data.qual.values + 'Q', data.qual.replace('Q', '').values)  # FLAG: Q
            journal(funcid + "spec. humidity (%s) available (replace: %s)" % (qvar, replace),
                    report, verbose)
            if replace:
                qvp = np.where(logic, np.nan, qvp)  # apply? BAD, GOOD
            # Fill gaps
            logic = (np.isfinite(qvp) & (~np.isfinite(vp)))  # GOOD, BAD
            vp = np.where(logic, qvp, vp)  # UPDATE, OLD

        if hasnames(data, dpdvar):
            dvp = vpfunc((data[tvar] - data[dpdvar]))  # dewpoint -> vp
            logic = ((dvp < vpmins) | (dvp > vpmaxs)) & np.isfinite(dvp)
            data['qual'] = np.where(logic, data.qual.values + 'D', data.qual.replace('D', '').values)  # FLAG: D
            journal(funcid + "DPD (%s) available (replace: %s)" % (dpdvar, replace),
                    report, verbose)
            if replace:
                dvp = np.where(logic, np.nan, dvp)  # apply? BAD, GOOD
            # Fill gaps
            logic = (np.isfinite(dvp) & (~np.isfinite(vp)))  # GOOD, BAD
            vp = np.where(logic, dvp, vp)  # UPDATE, OLD
        ########################################################################
        #
        # Convert vp to RH
        #
        data[rvar] = vp / vpsat  # rel. humidity
        logic = ((data[rvar].values < r_absmin) |
                 (data[rvar].values > r_absmax)) & np.isfinite(data[rvar].values)  # Range? BAD, GOOD
        data['qual'] = np.where(logic, data.qual.values + 'R', data.qual.replace('R', '').values)  # FLAG: R
        if replace:
            data[rvar] = np.where(logic, np.nan, data[rvar].values)  # apply? BAD, GOOD
    ############################################################################
    #
    # Unique flags
    #
    data['qual'] = unique_flags(data['qual'])
    ############################################################################
    #
    # Pressure
    #
    if remove_pressure:
        del data['p']

    if not inplace:
        return data
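
# Usage sketch (hedged): collapse q/dpd into relative humidity r = vp/esat(t)
# before vertical interpolation, as done in standard_data. Assumes columns
# t [K], p [Pa] and at least one of r, q, dpd.
def _example_rel_humidity(data):
    return standard_rel_humidity(data, update=False, replace=True, verbose=1)
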
def standard_dates_times(data, keep=False, inplace=False, night_noon=False,
                         report=None, verbose=0, **kwargs):
    """Fix datetime index to standard sounding times (0, 6, 12, 18) +/- 3 h

    Parameters
    ----------
    data : pd.DataFrame
        Radiosonde data, requiring a datetime index
    keep : bool
        Keep the old index as a column?
    inplace : bool
        Apply directly to the input (no copy)?
    night_noon : bool
        Select only 00 and 12 UTC
    verbose : int
        Verbosity
    kwargs : dict

    Returns
    -------
    pd.DataFrame, same as input
    """
    funcid = "[SDT] "
    data.index.name = 'date'
    itime = data.index.hour * 100 + data.index.minute  # e.g. 1200, 600 or 0

    # Pre-select only the relevant cases
    if night_noon:
        itx = (itime == 0) | (itime == 1200)
    else:
        itx = (itime == 0) | (itime == 600) | (itime == 1200) | (itime == 1800)

    if itx.size == 0:
        return data

    rx = data.index[~itx]
    if night_noon:
        rx = list(map(_fix_datetime_night_noon, rx))  # snap to 00/12
    else:
        rx = list(map(_fix_datetime, rx))  # snap to 00/06/12/18

    if not inplace:
        data = data.copy()

    data['newdate'] = data.index  # we have date and newdate!
    data['old_index'] = data['newdate'].copy()
    data.loc[~itx, 'newdate'] = rx

    # Check for duplicates
    idouble = data['newdate'] != data['old_index']  # what we changed
    itimes = data.loc[idouble, 'newdate'].unique()  # only these dates
    if len(itimes) > 0:
        # Only the dates matter now
        justdates = data.loc[data.newdate.isin(itimes), ['newdate']]
        justdates.index.name = 'old_index'
        justdates = justdates.reset_index()  # index, date, newdate
        # Group all dates and check whether one new date maps to multiple
        # old dates. If yes, split again.
        counts = justdates.groupby('newdate').apply(lambda x: len(x['old_index'].unique()))
        duplicates = counts[counts > 1].index  # how many duplicates are there?
        journal(funcid + "Changed: %d, Duplicates: %d" % (len(itimes), len(duplicates)),
                report, verbose)
        if len(duplicates) > 0:
            justdates = justdates[justdates.newdate.isin(duplicates)]  # selection
            justdates['tdiff'] = justdates['newdate'] - justdates['old_index']
            justdates['tdiff'] /= np.timedelta64(1, 'h')  # normalize to hours
            justdates = justdates.groupby('newdate').apply(fix_dates)
            justdates = justdates.set_index('old_index')
            data.loc[data.newdate.isin(duplicates), 'newdate'] = justdates['newdate']  # set back

    # General duplicate check, because real duplicates can occur!
    n = data.shape[0]
    if inplace:
        data.drop_duplicates(subset=['newdate', 'p'], inplace=True)
    else:
        data = data.drop_duplicates(subset=['newdate', 'p'])
    n2 = data.shape[0]
    if n2 != n:
        journal(funcid + "General duplicates removed: %d" % (n - n2), report, verbose)

    if night_noon:
        # Time difference between old and new index
        data['delta_t'] = data['newdate'] - data['old_index']

    data.set_index('newdate', inplace=True)
    data.index.name = 'date'  # fix name
    if not keep:
        del data['old_index']

    if not inplace:
        return data
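
# Usage sketch (illustrative): a launch at 23:45 lies within +/- 3 h of 00 UTC
# and is assigned to the following day, assuming the module helpers
# _fix_datetime and fix_dates behave as documented above.
def _example_standard_dates_times():
    import pandas as pd
    idx = pd.to_datetime(['2000-01-01 23:45', '2000-01-02 06:00'])
    df = pd.DataFrame({'p': [85000., 85000.], 't': [270., 271.]}, index=idx)
    # 23:45 -> 2000-01-02 00:00; 06:00 is already a standard hour
    return standard_dates_times(df)
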
def merge_interpolate(data1, data2, variables=None, dropna=True, min_count=5,
                      standard_time=True, d2name='era', report=None, verbose=0):
    """Interpolate and merge two datasets:

    1. Convert to standard times (0, 6, 12, 18)
    2. Merge the datasets
    3. Vertical log-p interpolation

    Interpolation levels are the union of both datasets.

    Parameters
    ----------
    data1 : DataFrame
        ['p', vars ...]
    data2 : DataFrame
        ['p', vars ...]
    variables : list
        Variables to use
    dropna : bool
        Remove missing values before interpolation
    min_count : int
        Minimum values per profile
    standard_time : bool
        Convert data to standard times

    Returns
    -------
    newdata : DataFrame
        ['p', vars ...]

    Raises
    ------
    ValueError
        Not a pandas DataFrame
    RuntimeError
        Missing p column
    """
    from interpolation import interp_profile

    funcid = "[M] "
    if not isinstance(data1, pd.DataFrame) or not isinstance(data2, pd.DataFrame):
        raise ValueError(funcid + "Requires a pandas DataFrame as input")

    if 'p' not in data1.columns or 'p' not in data2.columns:
        raise RuntimeError(funcid + "Missing pressure column: p")

    # Check variables: p must be included
    if variables is not None:
        if isinstance(variables, str):
            variables = [variables]
        if 'p' not in variables:
            variables.append('p')
        data1 = data1[variables].copy()
        data2 = data2[variables].copy()
        journal(funcid + "Subsetting ... %s" % ",".join(variables), report, verbose)

    data1.index.name = 'date'
    data2.index.name = 'date'

    # Match indices & merge (outer -> keep both sides)
    if standard_time:
        data1 = standard_dates_times(data1)  # 0, 6, 12, 18
    else:
        data1 = data1.copy()

    # Rename data2 (era) variables and make sure we have floats
    data2 = data2.astype('float64').rename(columns=lambda x: x + "_%s" % d2name).rename(
        columns={'p_%s' % d2name: 'p'}).reset_index()

    # Add marker columns
    data2['orig_%s' % d2name] = 1
    data1['orig_raso'] = 1
    journal(funcid + "Merging %s with %s on date and p" % (str(data1.shape), str(data2.shape)),
            report, verbose)

    # Merge (check: no data is lost?)
    alldata = pd.merge(data1.reset_index(), data2, left_on=['date', 'p'],
                       right_on=['date', 'p'], how='outer').set_index('date', drop=True)

    # Source marker: 0 raso only, 1 both, 2 era only
    alldata['source'] = np.where(np.isfinite(alldata['orig_%s' % d2name]), 2, 0)  # set ERA to 2
    alldata['source'] = np.where((np.isfinite(alldata['orig_raso'])) & (alldata['source'] == 2),
                                 alldata['source'] - 1, alldata['source'])  # set RASO+ERA to 1
    alldata.drop(['orig_%s' % d2name, 'orig_raso'], axis=1, inplace=True)

    # Protect marker columns from interpolation by casting them to str
    alldata['source'] = alldata['source'].astype(str)
    if 'orig' in alldata.columns:
        alldata['orig'] = np.int_(alldata['orig'].values)
        alldata['orig'] = alldata['orig'].astype(str)

    # Per-profile wrapper function:
    # pout -> all p values of the merged profile (union of both level sets)
    mod_interp = lambda x: interp_profile(x, pout=x['p'].values, pcolumn='p',
                                          dropna=dropna, min_values=min_count)

    journal(funcid + "Variables: %s" % ",".join(alldata.columns), report, verbose)
    journal(funcid + "Interpolating %s (NA: %s, Min: %d)" % (str(alldata.shape), dropna, min_count),
            report, verbose)
    newdata = alldata.groupby(alldata.index).apply(mod_interp)

    # Multi-index to long format / sort
    newdata = newdata.reset_index().drop('level_1', axis=1)
    newdata = newdata.sort_values(by=['date', 'p']).set_index('date', drop=True)

    # Cast the marker columns back to int
    newdata['source'] = newdata['source'].astype(int)
    if 'orig' in newdata.columns:
        newdata['orig'] = newdata['orig'].astype(int)

    journal(funcid + "Finished: %s > %s" % (str(data1.shape), str(newdata.shape)),
            report, verbose)
    return newdata
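
# Usage sketch (hedged): outer-merge a sonde frame with an ERA frame on
# (date, p) and interpolate every profile to the union of both level sets.
# The resulting 'source' column marks rows: 0 sonde only, 1 both, 2 ERA only.
def _example_merge_interpolate(sonde, era):
    return merge_interpolate(sonde, era, variables=['p', 't', 'r'],
                             dropna=True, min_count=5, d2name='era', verbose=1)
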
def interp_dataframe(data, levels=None, variables=None, min_count=5, dropna=True,
                     report=None, verbose=0):
    """Interpolate:

    1. Select only dates with enough (min_count) t and r values
    2. Interpolate each profile (date) vertically to levels

    Interpolation is only done at dates with enough data.

    Args:
        data        pandas DataFrame with p
    Keyword Args:
        levels      [1000 ... 100000] mod. ERA-Interim pressure levels
        variables   ['t', 'r'] variables for the quality check (min_count)
        min_count   [5] minimum required levels per profile
        dropna      [True] remove missing values before interpolation
        verbose     [0] show more info
    Returns:
        newdata     pandas DataFrame with newly interpolated values
    Raises:
        ValueError  when p or other variables are not inside data
        RuntimeError when no profile has enough data
    Calls:
        interp_profile -> interp_mod
    """
    from raso.config import std_plevels

    funcid = "[INTP] "
    if levels is None:
        levels = std_plevels

    if variables is None:
        variables = data.columns.tolist()

    # Fix index name
    if data.index.name is None or data.index.name == '':
        data.index.name = 'date'
    index_name = data.index.name

    # Check requested variables against the DataFrame columns
    variables = list(set(variables + ['p']))  # add p
    variables = data.columns[data.columns.isin(variables)].tolist()  # present in data?
    if len(variables) < 1 or 'p' not in variables:
        raise ValueError(funcid + "DataFrame requires at least 2 columns (p, +): %s" % (",".join(variables)))

    journal(funcid + "Quality check (%s) Min: %d NAN: %s" % (",".join(variables), min_count,
            color_boolean(dropna)), report, verbose)

    # Sometimes a variable is completely empty
    counts = data.count()[variables]
    if (counts == 0).any():
        variables = counts[counts > 0].index.tolist()
        if len(variables) < 1 or 'p' not in variables:
            print(counts)
            raise ValueError(funcid + "DataFrame requires at least 2 columns (p, +): %s" % (",".join(variables)))

    # Keep dates where more than 2 columns (p plus at least 2 variables)
    # have more than min_count values
    itx = (data.groupby(data.index).count()[variables] > min_count).sum(1) > 2
    if itx.sum() == 0:
        raise RuntimeError(funcid + "No data left with %d as minimum data count per profile" % min_count)
    data = data.loc[data.index.isin(itx[itx].index), :]
    journal(funcid + "from %d to %d" % (len(itx), itx.sum()), report, verbose)

    data = data.groupby(data.index).apply(interp_profile, variables=variables, pout=levels,
                                          oan=True, dropna=dropna, min_values=min_count)
    data = data.reset_index().drop('level_1', axis=1).sort_values(
        by=[index_name, 'p']).set_index(index_name, drop=True)
    journal(funcid + "Done: %s" % str(data.shape), report, verbose)
    return data
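
# Usage sketch (hedged): interpolate profiles (grouped by date) to the default
# standard pressure levels; profiles with fewer than min_count values in the
# checked variables are skipped entirely.
def _example_interp_dataframe(data):
    # levels=None selects the default level set (std_plevels)
    return interp_dataframe(data, levels=None, variables=['t', 'r'],
                            min_count=5, dropna=True, verbose=1)
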