Exemple #1
0
def merge(data, attach, standard_times=True, suffix='era', report=None, verbose=0):
    """Left-merge attach onto data, matching rows on date (index) and pressure (p)

    Every column of attach gets the suffix appended before merging. The suffixed
    pressure column is only used as a join key and is removed from the result.
    Neither input dataframe is modified.

    Parameters
    ----------
    data            DataFrame       Original Data (date index, column 'p')
    attach          DataFrame       Data that should be merged with Original (date index, column 'p')
    standard_times  bool            Pass data through standard_dates_times before merging
    suffix          str             Suffix for attached Columns (a leading '_' is added if missing)
    report                          Forwarded to standard_dates_times and journal
    verbose         int             verboseness

    Returns
    -------
    DataFrame       all rows of data (how='left') plus the suffixed attach columns, indexed by 'date'

    Raises
    ------
    ValueError      column 'p' is missing in data or attach
    """
    if 'p' not in data.columns or 'p' not in attach.columns:
        raise ValueError("Requires p in dataframes")

    # Match index names on renamed copies: assigning to .index.name would
    # rename the caller's own Index objects as a side effect.
    data = data.rename_axis('date')
    attach = attach.rename_axis('date')

    if standard_times:
        data = standard_dates_times(data, report=report, verbose=verbose)
        journal('[MERGE] Data with standard times: %s' % str(data.shape), report, verbose)

    # startswith() also copes with an empty suffix, where suffix[0] raises IndexError
    if not suffix.startswith('_'):
        suffix = '_' + suffix

    pkey = 'p' + suffix  # suffixed pressure column of attach, used as join key only
    merged = pd.merge(data.reset_index(),
                      attach.rename(columns=lambda x: x + suffix).reset_index(),
                      left_on=['date', 'p'],
                      right_on=['date', pkey],
                      how='left')

    # axis has to be given by keyword; the positional form was removed in pandas 2.0
    return merged.drop(pkey, axis=1).set_index('date', drop=True)
def standard_data(data, interpolate=True, int_vars=None, method='murphy_koop', levels=None, init_replace=True,
                  replace=False, daynight=True, report=None, verbose=0, **kwargs):
    """Standardize radiosonde data

    Processing steps:
    1. Quality Control
    2. Datetime fix (0/12 Soundings)
    3. Interpolation / std levels (ERA Levels)
    4. Variable conversion (td, vp, dpd)
    5. Final Quality Control

    Parameters
    ----------
    data            pd.DataFrame    Profile Data in Database format
    interpolate     bool            Interpolate to levels; if False only rows at these levels are selected
    int_vars        list            variables passed to the interpolation (default: ['p', 't', 'r'])
    method          str             saturation water vapor pressure formulation
    levels          list            pressure levels for interpolation / selection (default: era_plevels)
    init_replace    bool            First quality check replace?
    replace         bool            replace flagged values with NAN
    daynight        bool            use only: 00 and 12 UTC
    report                          forwarded to journal and the processing helpers
    verbose         int             verboseness
    kwargs          dict            accepted, not used inside this function

    Returns
    -------
    pd.DataFrame    standardized data (the function works on a private copy of the input)

    Raises
    ------
    ValueError      data is not a pandas DataFrame
    """
    from raso.config import era_plevels
    from qc import control
    from dpd_anomaly import dpd_anomaly
    from interpolation import interp_dataframe
    from standard_dates_times import standard_dates_times

    funcid = "[STD] Data "
    if not isinstance(data, pd.DataFrame):
        raise ValueError(funcid + "Requires a DataFrame Database style")
    start = set_starttime()
    ############################################################################
    #
    # Create a private copy and modify it
    #
    data = data.copy()  # active copy
    ############################################################################
    #
    # QC ( replace with NAN)
    #
    data = control(data, replace=init_replace, report=report, verbose=verbose - 1)
    #
    # DPD 30 Detection
    #
    if 'dpd' in data.columns:
        # for IGRA ?
        data = dpd_anomaly(data, var='dpd', num_years=10, before_year='1994', replace=replace,
                           report=report, verbose=verbose - 1)
    #
    # Standard dates, times, remove duplicates!!
    #
    standard_dates_times(data, inplace=True, report=report, verbose=verbose - 1)
    journal(funcid + 'Standard Dates-Times (%s)' % str(data.shape), report, verbose)
    #
    if daynight:
        # hour and minute encoded as HHMM so that only exact 00:00 and 12:00 pass
        itime = data.index.hour * 100 + data.index.minute
        itx = (itime == 0) | (itime == 1200)  # including minutes
        # .loc with a boolean mask; the former .ix indexer was removed in pandas 1.0
        data = data.loc[itx, :].copy()  # Limit to 0 and 12
        journal(funcid + "Data [0, 12]: %s" % str(data.shape), report, verbose)
    ############################################################################
    #
    # Convert humidity variable to relative humidity for interpolation
    # See manual
    standard_rel_humidity(data, inplace=True, replace=replace, method=method, report=report, verbose=verbose - 1)
    ############################################################################
    #
    # Standard levels / interpolate
    #
    take_era = False
    if levels is None:
        levels = era_plevels   # a lot of levels ?
        take_era = True

    if interpolate:
        if int_vars is None:
            int_vars = ['p', 't', 'r']
        data = interp_dataframe(data, variables=int_vars, levels=levels, report=report, verbose=verbose - 1)
        # NOTE(review): 'orig' is presumably the marker column written by interp_dataframe - confirm
        data = data.query('orig>0').copy()  # .drop('orig',1) # Only Standard Levels

    else:
        # keep only rows whose pressure equals one of the levels (%d formats each level as an integer)
        cmd = "|".join(["p==%d" % ip for ip in levels])
        data = data.query(cmd)
        journal(funcid + 'Level selection (%d), ERA (%s)' % (len(levels), take_era), report, verbose)
    ############################################################################
    #
    # Final Quality Control
    #
    data = control(data, replace=replace, report=report, verbose=verbose - 1)
    ############################################################################
    #
    # Convert RH to vp, dpd
    #
    standard_water_vapor(data, inplace=True, replace=replace, method=method, report=report, verbose=verbose - 1)
    #
    standard_dewpoint_depression(data, inplace=True, replace=replace, method=method, report=report, verbose=verbose - 1)
    #
    # DPD 30 Detection (repeated on the converted dpd)
    #
    if 'dpd' in data.columns:
        data = dpd_anomaly(data, var='dpd', num_years=10, before_year='1994', replace=replace, report=report,
                           verbose=verbose - 1)
    ############################################################################
    if verbose > 0:
        print_time(start)
    return data
Exemple #3
0
def merge_interpolate(data1, data2, variables=None, dropna=True, min_count=5, standard_time=True, d2name='era',
                      report=None, verbose=0):
    """
    Interpolate and Merge two datasets:
    1. convert to standard times (0,6,12,18)
    2. Merge Datasets
    3. Vertical logp interpolation

    Interpolation levels are the union of both datasets

    Parameters
    ----------
    data1             DataFrame     ['p', vars ... ]
    data2             DataFrame     ['p', vars ... ]
    variables         list / str    variables to use ('p' is added if missing); None keeps all columns
    dropna            bool          remove missing values before interpolation
    min_count         int           minimum values per profile
    standard_time     bool          convert data1 to standard times
    d2name            str           name of data2, appended as '_<d2name>' to its columns
    report                          forwarded to journal
    verbose           int           verboseness

    Returns
    -------
    newdata           DataFrame     ['p', vars ... ] indexed by date, sorted by date and p

    Raises
    ------
    ValueError     not a pandas DataFrame
    RuntimeError   missing p column
    """
    from interpolation import interp_profile

    funcid = "[M] "
    if not isinstance(data1, pd.DataFrame) or not isinstance(data2, pd.DataFrame):
        raise ValueError(funcid + "Requires a pandas DataFrame as input")

    if 'p' not in data1.columns or 'p' not in data2.columns:
        raise RuntimeError(funcid + "Missing pressure column: p")

    # check variables ...
    if variables is not None:
        if isinstance(variables, str):
            # a bare string is ONE variable name; list('temp') would split it into
            # characters and "'p' in 'temp'" would be a substring test
            variables = [variables]
        else:
            # private copy, so 'p' is never appended to the caller's list
            variables = list(variables)

        # pressure is always needed as merge key
        if 'p' not in variables:
            variables.append('p')

        data1 = data1[variables].copy()
        data2 = data2[variables].copy()
        journal(funcid + "Subsetting ... %s" % ",".join(variables), report, verbose)

    # Name both indices 'date' on new objects; assigning to .index.name would
    # rename the caller's Index in place. This also gives data1 a private copy,
    # so the marker column below never ends up in the caller's dataframe.
    data1 = data1.rename_axis('date')
    data2 = data2.rename_axis('date')

    # match indices & merge (outer -> add both sides)
    if standard_time:
        data1 = standard_dates_times(data1)  # 0,6,12,18

    d2_marker = 'orig_%s' % d2name

    # rename era variables and make sure we have floats!!
    data2 = data2.astype('float64').rename(columns=lambda x: x + "_%s" % d2name).rename(
        columns={'p_%s' % d2name: 'p'}).reset_index()

    # marker columns: finite after the outer merge only where the row came from that dataset
    data2[d2_marker] = 1
    data1['orig_raso'] = 1

    journal(funcid + " Merging %s with %s on date and p" %(str(data1.shape), str(data2.shape)), report, verbose)
    # Merge
    # CHECK ? no data is lost ?
    alldata = pd.merge(data1.reset_index(),
                       data2,
                       left_on=['date', 'p'],
                       right_on=['date', 'p'],
                       how='outer').set_index('date', drop=True)

    # source: 0 raw (data1 only), 1 both, 2 only data2
    in_d2 = np.isfinite(alldata[d2_marker])
    in_raso = np.isfinite(alldata['orig_raso'])
    alldata['source'] = np.where(in_d2, np.where(in_raso, 1, 2), 0)

    # drop the markers; the data2 marker follows d2name (was hard-coded to 'orig_era'),
    # and axis is given by keyword because the positional form was removed in pandas 2.0
    alldata.drop([d2_marker, 'orig_raso'], axis=1, inplace=True)

    # NOTE(review): flag columns are turned into strings here, presumably so that
    # interp_profile does not treat them as numeric variables - confirm.
    alldata['source'] = alldata['source'].astype(str)
    if 'orig' in alldata.columns:
        alldata['orig'] = np.int_(alldata['orig'].values)
        alldata['orig'] = alldata['orig'].astype(str)

    # every profile / define wrapper function
    # pout -> pressure levels of the merged profile (union of both datasets)
    mod_interp = lambda x: interp_profile(x, pout=x['p'].values, pcolumn='p', dropna=dropna, min_values=min_count)
    journal(funcid+ "Variables: %s" % ",".join(alldata.columns), report, verbose)
    journal(funcid + " Interpolating %s (NA: %s, Min: %d)" % (str(alldata.shape), dropna, min_count), report, verbose)

    # one interpolation per date (profile)
    newdata = alldata.groupby(alldata.index).apply(mod_interp)
    # drop the inner index level created by groupby-apply
    newdata = newdata.reset_index().drop('level_1', axis=1)

    # Multi-index to long / sort
    newdata = newdata.sort_values(by=['date', 'p']).set_index('date', drop=True)

    # NOTE(review): the conversions below act on alldata, which is not returned;
    # presumably they were meant for newdata - confirm against the output of
    # interp_profile before changing. Kept as in the original.
    alldata['source'] = alldata['source'].astype(int)

    if 'orig' in alldata.columns:
        alldata['orig'] = alldata['orig'].astype(int)

    journal(funcid + " finished: %s > %s" % (str(data1.shape), str(newdata.shape)), report, verbose)

    return newdata