def merge(data, attach, standard_times=True, suffix='era', report=None, verbose=0):
    """Merge data and attach into one dataframe

    Matching criteria are date (index) and pressure (p).

    Parameters
    ----------
    data : DataFrame
        Original data
    attach : DataFrame
        Data to be merged with the original
    standard_times : bool
        Consider only 0 and 12 UTC for merging
    suffix : str
        Suffix for attached columns
    report : list
        passed on to journal()
    verbose : int
        verbosity level

    Returns
    -------
    DataFrame
        data with the attached (suffixed) columns merged in
    """
    if 'p' not in data.columns or 'p' not in attach.columns:
        raise ValueError("Requires p in both dataframes")

    # Match index names
    data.index.name = 'date'
    attach.index.name = 'date'

    if standard_times:
        # convert sonde data to standard times to match better with ERA-Interim data
        data = standard_dates_times(data, report=report, verbose=verbose)
        journal('[MERGE] Data with standard times: %s' % str(data.shape), report, verbose)

    if not suffix.startswith('_'):
        suffix = '_' + suffix

    return pd.merge(data.reset_index(),
                    attach.rename(columns=lambda x: x + suffix).reset_index(),
                    left_on=['date', 'p'],
                    right_on=['date', 'p' + suffix],
                    how='left').drop('p' + suffix, axis=1).set_index('date', drop=True)
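# ---------------------------------------------------------------------------
# Minimal usage sketch for merge() (illustration only, not part of the API;
# the profile values below are made up, and with standard_times=False the
# call needs nothing beyond pandas):
# ---------------------------------------------------------------------------
def _example_merge():
    import pandas as pd
    dates = pd.to_datetime(['2000-01-01 00:00', '2000-01-01 00:00'])
    sonde = pd.DataFrame({'p': [85000., 50000.], 't': [265.2, 248.7]}, index=dates)
    era = pd.DataFrame({'p': [85000., 50000.], 't': [264.8, 249.1]}, index=dates)
    merged = merge(sonde, era, standard_times=False, suffix='era')
    print(merged.columns.tolist())  # ['p', 't', 't_era']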
def standard_data(data, interpolate=True, int_vars=None, method='murphy_koop', levels=None,
                  init_replace=True, replace=False, daynight=True, report=None, verbose=0,
                  **kwargs):
    """Standardize radiosonde data

    1. Quality control
    2. Datetime fix (0/12 soundings)
    3. Interpolation to standard levels (ERA levels)
    4. Variable conversion (td, vp, dpd)
    5. Final quality control

    Parameters
    ----------
    data : pd.DataFrame
        Profile data in database format
    interpolate : bool
        Interpolate to levels (default: era_plevels)
    int_vars : list
        variables to interpolate (default: ['p', 't', 'r'])
    method : str
        saturation water vapor pressure formulation
    levels : list
        pressure levels for interpolation
    init_replace : bool
        replace flagged values in the first quality check?
    replace : bool
        replace flagged values with NaN
    daynight : bool
        use only 00 and 12 UTC soundings
    report : list
        passed on to journal()
    verbose : int
        verbosity level
    kwargs : dict
        additional options

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        data is not a pandas DataFrame
    """
    from raso.config import era_plevels
    from qc import control
    from dpd_anomaly import dpd_anomaly
    from interpolation import interp_dataframe
    from standard_dates_times import standard_dates_times

    funcid = "[STD] Data "
    if not isinstance(data, pd.DataFrame):
        raise ValueError(funcid + "Requires a DataFrame in database style")

    start = set_starttime()
    ############################################################################
    #
    # Create a private copy and modify it
    #
    data = data.copy()  # active copy
    ############################################################################
    #
    # QC (replace flagged values with NaN)
    #
    data = control(data, replace=init_replace, report=report, verbose=verbose - 1)
    #
    # DPD 30 detection
    #
    if 'dpd' in data.columns:  # for IGRA?
        data = dpd_anomaly(data, var='dpd', num_years=10, before_year='1994', replace=replace,
                           report=report, verbose=verbose - 1)
    #
    # Standard dates and times, remove duplicates
    #
    standard_dates_times(data, inplace=True, report=report, verbose=verbose - 1)
    journal(funcid + 'Standard Dates-Times (%s)' % str(data.shape), report, verbose)

    if daynight:
        itime = data.index.hour * 100 + data.index.minute
        itx = (itime == 0) | (itime == 1200)  # including minutes
        data = data.loc[itx, :].copy()  # limit to 00 and 12 UTC
        journal(funcid + "Data [0, 12]: %s" % str(data.shape), report, verbose)
    ############################################################################
    #
    # Convert humidity variable to relative humidity for interpolation
    # (see manual)
    #
    standard_rel_humidity(data, inplace=True, replace=replace, method=method,
                          report=report, verbose=verbose - 1)
    ############################################################################
    #
    # Standard levels / interpolation
    #
    take_era = False
    if levels is None:
        levels = era_plevels  # ERA default levels
        take_era = True

    if interpolate:
        if int_vars is None:
            int_vars = ['p', 't', 'r']
        data = interp_dataframe(data, variables=int_vars, levels=levels,
                                report=report, verbose=verbose - 1)
        data = data.query('orig>0').copy()  # keep only standard levels
    else:
        cmd = "|".join(["p==%d" % ip for ip in levels])
        data = data.query(cmd)
    journal(funcid + 'Level selection (%d), ERA (%s)' % (len(levels), take_era), report, verbose)
    ############################################################################
    #
    # Final quality control
    #
    data = control(data, replace=replace, report=report, verbose=verbose - 1)
    ############################################################################
    #
    # Convert RH to vp and dpd
    #
    standard_water_vapor(data, inplace=True, replace=replace, method=method,
                         report=report, verbose=verbose - 1)
    standard_dewpoint_depression(data, inplace=True, replace=replace, method=method,
                                 report=report, verbose=verbose - 1)
    #
    # DPD 30 detection
    #
    if 'dpd' in data.columns:
        data = dpd_anomaly(data, var='dpd', num_years=10, before_year='1994', replace=replace,
                           report=report, verbose=verbose - 1)
    ############################################################################
    if verbose > 0:
        print_time(start)
    return data
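# ---------------------------------------------------------------------------
# Sketch of the level selection used by standard_data(..., interpolate=False)
# (illustration only, toy levels and data). query() understands that
# comparisons bind tighter than |, so no parentheses are needed in cmd.
# ---------------------------------------------------------------------------
def _example_level_selection():
    import pandas as pd
    levels = [100000, 85000, 50000]
    data = pd.DataFrame({'p': [100000, 92500, 85000, 70000, 50000],
                         't': [288.0, 284.1, 280.5, 270.3, 252.9]})
    cmd = "|".join(["p==%d" % ip for ip in levels])
    print(cmd)              # p==100000|p==85000|p==50000
    print(data.query(cmd))  # keeps only the rows on the requested levels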
def merge_interpolate(data1, data2, variables=None, dropna=True, min_count=5,
                      standard_time=True, d2name='era', report=None, verbose=0):
    """Interpolate and merge two datasets

    1. Convert to standard times (0, 6, 12, 18)
    2. Merge the datasets
    3. Vertical log-p interpolation

    Interpolation levels are the union of both datasets.

    Parameters
    ----------
    data1 : DataFrame
        ['p', vars ...]
    data2 : DataFrame
        ['p', vars ...]
    variables : list
        variables to use
    dropna : bool
        remove missing values before interpolation
    min_count : int
        minimum values per profile
    standard_time : bool
        convert data1 to standard times
    d2name : str
        suffix for data2 columns
    report : list
        passed on to journal()
    verbose : int
        verbosity level

    Returns
    -------
    DataFrame
        ['p', vars ...]

    Raises
    ------
    ValueError
        not a pandas DataFrame
    RuntimeError
        missing p column
    """
    from interpolation import interp_profile

    funcid = "[M] "
    if not isinstance(data1, pd.DataFrame) or not isinstance(data2, pd.DataFrame):
        raise ValueError(funcid + "Requires a pandas DataFrame as input")
    if 'p' not in data1.columns or 'p' not in data2.columns:
        raise RuntimeError(funcid + "Missing pressure column: p")

    if variables is not None:
        if isinstance(variables, str):
            variables = [variables]
        if 'p' not in variables:
            variables.append('p')
        data1 = data1[variables].copy()
        data2 = data2[variables].copy()
        journal(funcid + "Subsetting ... %s" % ",".join(variables), report, verbose)

    data1.index.name = 'date'
    data2.index.name = 'date'
    # match indices & merge (outer -> keep both sides)
    if standard_time:
        data1 = standard_dates_times(data1)  # 0, 6, 12, 18
    else:
        data1 = data1.copy()

    # rename data2 variables (keep p as merge key) and make sure we have floats
    data2 = data2.astype('float64').rename(columns=lambda x: x + "_%s" % d2name).rename(
        columns={'p_%s' % d2name: 'p'}).reset_index()

    # add marker columns
    data2['orig_%s' % d2name] = 1
    data1['orig_raso'] = 1
    journal(funcid + "Merging %s with %s on date and p" % (str(data1.shape), str(data2.shape)),
            report, verbose)
    # outer merge, so no data is lost on either side
    alldata = pd.merge(data1.reset_index(), data2, left_on=['date', 'p'],
                       right_on=['date', 'p'], how='outer').set_index('date', drop=True)

    # source flag: 0 raso only, 1 both, 2 era only
    alldata['source'] = np.where(np.isfinite(alldata['orig_%s' % d2name]), 2, 0)  # ERA -> 2
    alldata['source'] = np.where(np.isfinite(alldata['orig_raso']) & (alldata['source'] == 2),
                                 alldata['source'] - 1, alldata['source'])  # RASO+ERA -> 1
    alldata.drop(['orig_%s' % d2name, 'orig_raso'], axis=1, inplace=True)
    # store flag columns as str, so they pass through the interpolation untouched
    alldata['source'] = alldata['source'].astype(str)
    if 'orig' in alldata.columns:
        alldata['orig'] = np.int_(alldata['orig'].values)
        alldata['orig'] = alldata['orig'].astype(str)

    # per-profile wrapper: interpolate onto the merged p-levels of that profile
    # (the union of the levels of both datasets)
    mod_interp = lambda x: interp_profile(x, pout=x['p'].values, pcolumn='p', dropna=dropna,
                                          min_values=min_count)

    journal(funcid + "Variables: %s" % ",".join(alldata.columns), report, verbose)
    journal(funcid + "Interpolating %s (NA: %s, Min: %d)" % (str(alldata.shape), dropna, min_count),
            report, verbose)
    newdata = alldata.groupby(alldata.index).apply(mod_interp)
    # drop the groupby level, back to long format and sort
    newdata = newdata.reset_index().drop('level_1', axis=1)
    newdata = newdata.sort_values(by=['date', 'p']).set_index('date', drop=True)
    # convert the flag columns back to int on the returned frame
    newdata['source'] = newdata['source'].astype(int)
    if 'orig' in newdata.columns:
        newdata['orig'] = newdata['orig'].astype(int)
    journal(funcid + "finished: %s > %s" % (str(data1.shape), str(newdata.shape)), report, verbose)
    return newdata
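# ---------------------------------------------------------------------------
# Self-contained sketch of the source flag logic in merge_interpolate()
# (illustration only, toy marker columns): 0 = raso only, 1 = both, 2 = era.
# ---------------------------------------------------------------------------
def _example_source_flag():
    import numpy as np
    import pandas as pd
    # toy outer-merge result: NaN markers mean "absent on that side"
    alldata = pd.DataFrame({'orig_raso': [1, 1, np.nan],
                            'orig_era': [np.nan, 1, 1]})
    source = np.where(np.isfinite(alldata['orig_era']), 2, 0)
    source = np.where(np.isfinite(alldata['orig_raso']) & (source == 2),
                      source - 1, source)
    print(source)  # [0 1 2]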