Example #1
0
def merge(data, attach, standard_times=True, suffix='era', report=None, verbose=0):
    """Left-join the columns of attach onto data, matching on date and p

    Every column of attach gets the suffix appended before merging. The
    suffixed pressure column of attach is only used as join key and is
    removed from the result.

    Parameters
    ----------
    data            DataFrame       Original Data (date index, column p)
    attach          DataFrame       Data that should be merged with Original (date index, column p)
    standard_times  bool            Run data through standard_dates_times before merging
    suffix          str             Suffix for attached columns, a leading '_' is added if missing
    report                          handed on to journal / standard_dates_times
    verbose         int             verboseness

    Returns
    -------
    DataFrame       rows of data (left join) plus suffixed columns of attach, indexed by date

    Raises
    ------
    ValueError      p is missing in data or attach

    Notes
    -----
    The index name of both input frames is set to 'date' in place.
    """
    if 'p' not in data.columns or 'p' not in attach.columns:
        raise ValueError("Requires p in dataframes")

    # Both indices become the join column 'date' after reset_index
    data.index.name = 'date'
    attach.index.name = 'date'

    if standard_times:
        data = standard_dates_times(data, report=report, verbose=verbose)
        journal('[MERGE] Data with standard times: %s' % str(data.shape), report, verbose)

    # startswith also copes with an empty suffix (suffix[0] would raise IndexError)
    if not suffix.startswith('_'):
        suffix = '_' + suffix

    p_attached = 'p' + suffix
    right = attach.rename(columns=lambda x: x + suffix).reset_index()
    merged = pd.merge(data.reset_index(),
                      right,
                      left_on=['date', 'p'],
                      right_on=['date', p_attached],
                      how='left')
    # axis as keyword: the positional form is not accepted by recent pandas
    return merged.drop(p_attached, axis=1).set_index('date', drop=True)
Example #2
0
def standard_data(data, interpolate=True, int_vars=None, method='murphy_koop', levels=None, init_replace=True,
                  replace=False, daynight=True, report=None, verbose=0, **kwargs):
    """Standardize radiosonde data
    1. Quality Control
    2. Datetime fix (0/12 Soundings)
    3. Interpolation / std levels (ERA Levels)
    4. Variable conversion (td, vp, dpd)
    5. Final Quality Control

    The input is copied first; the caller's frame is not modified.

    Parameters
    ----------
    data            pd.DataFrame    Profile Data in Database format
    interpolate     bool            Interpolate to levels (default: era_plevels)
    int_vars        list            variables handed to the interpolation (default: p, t, r)
    method          str             saturation water vapor pressure formulation
    levels          list            pressure levels for interpolation
    init_replace    bool            First quality check replace?
    replace         bool            replace flagged values with NAN
    daynight        bool            use only: 00 and 12 UTC
    report                          handed on to journal and the called routines
    verbose         int             verboseness
    kwargs          dict            not used

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError      data is not a DataFrame
    """
    from raso.config import era_plevels
    from qc import control
    from dpd_anomaly import dpd_anomaly
    from interpolation import interp_dataframe
    from standard_dates_times import standard_dates_times

    funcid = "[STD] Data "
    if not isinstance(data, pd.DataFrame):
        raise ValueError(funcid + "Requires a DataFrame Database style")
    start = set_starttime()
    ############################################################################
    #
    # Create a private copy and modify it
    #
    data = data.copy()  # active copy
    ############################################################################
    #
    # QC ( replace with NAN)
    #
    data = control(data, replace=init_replace, report=report, verbose=verbose - 1)
    #
    # DPD 30 Detection
    #
    if 'dpd' in data.columns:
        # for IGRA ?
        data = dpd_anomaly(data, var='dpd', num_years=10, before_year='1994', replace=replace,
                           report=report, verbose=verbose - 1)
    #
    # Standard dates, times, remove duplicates!!
    #
    standard_dates_times(data, inplace=True, report=report, verbose=verbose - 1)
    journal(funcid + 'Standard Dates-Times (%s)' % str(data.shape), report, verbose)
    #
    if daynight:
        itime = data.index.hour * 100 + data.index.minute
        itx = (itime == 0) | (itime == 1200)  # including minutes
        # boolean row selection with .loc (.ix is not available in recent pandas)
        data = data.loc[itx, :].copy()  # Limit to 0 and 12
        journal(funcid + "Data [0, 12]: %s" % str(data.shape), report, verbose)
    ############################################################################
    #
    # Convert humidity variable to relative humidity for interpolation
    # See manual
    standard_rel_humidity(data, inplace=True, replace=replace, method=method, report=report, verbose=verbose - 1)
    ############################################################################
    #
    # Standard levels / interpolate
    #
    take_era = False
    if levels is None:
        levels = era_plevels   # a lot of levels ?
        take_era = True

    if interpolate:
        if int_vars is None:
            int_vars = ['p', 't', 'r']
        data = interp_dataframe(data, variables=int_vars, levels=levels, report=report, verbose=verbose - 1)
        data = data.query('orig>0').copy()  # Only Standard Levels (rows with orig > 0)

    else:
        # no interpolation: keep only rows that already sit on one of the levels
        cmd = "|".join(["p==%d" % ip for ip in levels])
        data = data.query(cmd)
        journal(funcid + 'Level selection (%d), ERA (%s)' % (len(levels), take_era), report, verbose)
    ############################################################################
    #
    # Final Quality Control
    #
    data = control(data, replace=replace, report=report, verbose=verbose - 1)
    ############################################################################
    #
    # Convert RH to vp, dpd
    #
    standard_water_vapor(data, inplace=True, replace=replace, method=method, report=report, verbose=verbose - 1)
    #
    standard_dewpoint_depression(data, inplace=True, replace=replace, method=method, report=report, verbose=verbose - 1)
    #
    # DPD 30 Detection
    #
    if 'dpd' in data.columns:
        data = dpd_anomaly(data, var='dpd', num_years=10, before_year='1994', replace=replace, report=report,
                           verbose=verbose - 1)
    ############################################################################
    if verbose > 0:
        print_time(start)
    return data
Example #3
0
def standard_dewpoint_depression(data, tvar='t', dpdvar='dpd', vpvar='vp', tdvar='td', update=False, replace=False,
                                 method='murphy_koop', inplace=False, report=None, verbose=0, **kwargs):
    """ Derive dewpoint depression (and dewpoint) from water vapor pressure

    Nothing is calculated when dpdvar is already present and update is False.
    Otherwise dpd = t - dewpoint(vp); with an existing dpd column only its
    non-finite entries are filled. Afterwards two checks set quality flags:
      D   dpd below 0 or above 60
      Y   dewpoint larger than temperature

    Parameters
    ----------
    data        pd.DataFrame    Input Radiosonde Data
    tvar        str             Temperature variable to use: t, t_cor
    dpdvar      str             Dewpoint Dep. variable to use: dpd, dpd_mcor
    vpvar       str             Water vapor pressure: vp
    tdvar       str             Dewpoint variable to write: td
    update      bool            update dpd
    replace     bool            set flagged dpd, td and vp to NAN
    method      str             saturation water vapor formulation
    inplace     bool            apply directly to input / no copy ?
    report                      handed on to journal
    verbose     int             verboseness
    kwargs      dict            **

    Returns
    -------
    pd.DataFrame / same as input (add columns), nothing if inplace

    Raises
    ------
    ValueError      wrong input type, missing vp or t
    """
    from raso.met.conversion import dewpoint

    funcid = "[SV] "
    upper_limit = 60.  # largest accepted dewpoint depression

    if not isinstance(data, (pd.DataFrame, pd.Panel)):
        raise ValueError(funcid + "Requires a Dataframe or Panel")

    if not inplace:
        data = data.copy()

    if not hasnames(data, 'qual'):
        data['qual'] = ''

    if not hasnames(data, vpvar):
        raise ValueError(funcid + " Requires a humidity variable: vp")

    if not hasnames(data, tvar):
        raise ValueError(funcid + " Requires variable: t")

    if hasnames(data, dpdvar) and not update:
        # dpd is already there and no update requested
        return None if inplace else data

    def set_flag(bad, letter, columns):
        # blank the flagged values (only with replace) and append / clear the flag letter
        if replace:
            for column in columns:
                data[column] = np.where(bad, np.nan, data[column].values)
        # NOTE(review): replace() here matches whole cells only, presumably the letter
        # should be removed from inside longer flag strings as well - confirm
        data['qual'] = np.where(bad, data.qual.values + letter, data.qual.replace(letter, '').values)

    ############################################################################
    #
    # Dewpoint depression from vp / for IGRA mostly
    #
    computed = data[tvar] - dewpoint(data[vpvar], method=method)
    if hasnames(data, dpdvar):
        # keep existing values, fill only where the old one is missing and the new one is not
        gaps = (np.isfinite(computed) & ~np.isfinite(data[dpdvar].values))
        filled = np.where(gaps, computed, data[dpdvar].values)
        data[dpdvar] = filled
        data[tdvar] = data[tvar].values - filled
    else:
        data[dpdvar] = computed
        data[tdvar] = data[tvar].values - computed

    journal(funcid + "DPD (%s, %s) from vp (%s), replace: %s" % (dpdvar, tvar, vpvar, replace), report, verbose)
    #
    # DPD valid range ?  FLAG: D
    #
    current = data[dpdvar].values
    set_flag((current < 0) | (current > upper_limit), 'D', (dpdvar, vpvar, tdvar))
    #
    # Dewpoint larger than Temperature!  FLAG: Y
    #
    set_flag(data[tdvar].values > data[tvar].values, 'Y', (dpdvar, tdvar, vpvar))
    ############################################################################
    #
    # Unique Flags
    #
    data['qual'] = unique_flags(data['qual'])

    if not inplace:
        return data
Example #4
0
def standard_water_vapor(data, tvar='t', dpdvar='dpd', vpvar='vp', rvar='r', qvar='q', update=False, replace=False,
                         method='murphy_koop', inplace=False, report=None, verbose=0, **kwargs):
    """ Convert humidity variables to water vapor pressure

    Only the first available source is used, in this order:
    1. RH (Esat)
    2. Q  (P)
    3. DPD (Esat)
    --> VP

    Nothing is calculated when vpvar is already present and update is False.
    Values outside the pressure dependent limits get the quality flag V.

    Limits are from the RTTOV Coefficient file table 54 levels

    Notes
    -----
    http://nwpsaf.eu/oldsite/deliverables/rtm/rttov11_coefficients.html#54L_reg_limits

    Parameters
    ----------
    data        DataFrame       Input database
    tvar        str             temperature variable
    dpdvar      str             dewpoint depression variable
    vpvar       str             water vapor pressure variable (written)
    rvar        str             relative humidity variable
    qvar        str             specific humidity variable
    update      bool            recalculate although vpvar exists
    replace     bool            set flagged vp to NAN
    method      str             name of the function in esat_functions
    inplace     bool            apply directly to input / no copy ?
    report                      handed on to journal
    verbose     int             verboseness
    kwargs      **

    Returns
    -------
    data, nothing if inplace

    Raises
    ------
    ValueError      wrong input type, no humidity variable or no temperature
    RuntimeError    no humidity variable found
    """
    from raso.met.conversion import sh2vap
    from raso.met import esat_functions
    from raso.qc import profile_limits
    ############################################################################
    rt = profile_limits(tohpa=True, simple_names=True)  # RTTOV Variable Limits
    rt['p'] *= 100.         # hPa to Pa
    rt['vpmin'] *= 100.     # hPa to Pa
    rt['vpmax'] *= 100.     # hPa to Pa
    ############################################################################
    funcid = "[SV] "

    if not isinstance(data, (pd.DataFrame, pd.Panel)):
        raise ValueError(funcid + "Requires a Dataframe or Panel")

    if not inplace:
        data = data.copy()

    if not hasnames(data, 'qual'):
        data['qual'] = ''

    if hasnames(data, [rvar, qvar, dpdvar], value=0):
        raise ValueError(funcid + " Requires a humidity variable: r, q or dpd")

    if hasnames(data, tvar, value=0):
        raise ValueError(funcid + " Requires variable: t")

    vpfunc = getattr(esat_functions, method)

    if not hasnames(data, vpvar) or update:
        remove_pressure = False
        if isinstance(data, pd.Panel):
            data['p'] = 0.  # add minor_axis as p
            data.loc['p', :, :] = np.asarray(data.minor_axis)[np.newaxis, np.newaxis, :]
            remove_pressure = True
            ndates = data.shape[1] * data.shape[2]
        else:
            ndates = data.shape[0]
        ############################################################################
        #
        # R
        #
        if hasnames(data, rvar):
            journal(funcid + "Using r (%s) and t (%s) for vp (%s)" % (rvar, tvar, vpvar), report, verbose)
            data[vpvar] = data[rvar].values * vpfunc(data[tvar].values)  # Convert r,t to vp
        ############################################################################
        #
        # Q
        #
        elif hasnames(data, qvar):  # same argument order as every other hasnames call
            journal(funcid + "Using q (%s) and p to fill up gaps for vp (%s)" % (qvar, vpvar), report, verbose)
            vp = sh2vap(data[qvar].values, data['p'].values)  # Convert q,p to vp
            if hasnames(data, vpvar):
                logic = (np.isfinite(vp) & ~np.isfinite(data[vpvar].values))  # Update? GOOD, BAD
                data[vpvar] = np.where(logic, vp, data[vpvar].values)  # Update? NEW, OLD
            else:
                # no vp yet: without this the quality control below fails on the missing column
                data[vpvar] = vp
        ############################################################################
        #
        # DPD (IGRA)
        #
        elif hasnames(data, dpdvar):
            journal(funcid + "Warning using dpd (%s) for vp (%s)" % (dpdvar, vpvar), report, verbose)
            vp = vpfunc((data[tvar] - data[dpdvar]).values)  # Convert Td to vp

            if hasnames(data, vpvar):
                logic = (np.isfinite(vp) & ~np.isfinite(data[vpvar].values))  # Update? GOOD, BAD
                data[vpvar] = np.where(logic, vp, data[vpvar].values)  # Update? NEW, OLD

            else:
                data[vpvar] = vp
        else:
            raise RuntimeError("No humidity variable found!")
        ############################################################################
        #
        # Quality control: limits interpolated in log(p)
        #
        vpmins = np.interp(np.log(data.p.values), np.log(rt.p.values), rt.vpmin.values, left=rt.vpmin.min(),
                           right=rt.vpmin.max())  # Interpolate Minimum
        vpmaxs = np.interp(np.log(data.p.values), np.log(rt.p.values), rt.vpmax.values, left=rt.vpmax.min(),
                           right=rt.vpmax.max())  # Interpolate Maximum
        # Range? BAD, GOOD (missing values are not flagged)
        logic = ((data[vpvar].values < vpmins) | (data[vpvar].values > vpmaxs)) & np.isfinite(data[vpvar].values)
        data['qual'] = np.where(logic, data.qual.values + 'V', data.qual.replace('V', '').values)  # FLAG: V
        journal(funcid + "#%8d V flagged. (%d)" % (np.sum(np.sum(flag_inside(data.qual, 'V'))), ndates), report,
                verbose)

        if replace:
            data[vpvar] = np.where(logic, np.nan, data[vpvar].values)  # Apply? BAD, GOOD
        ############################################################################
        #
        # Unique Flags
        #
        data['qual'] = unique_flags(data['qual'])
        ############################################################################
        #
        # Pressure (only added above for a Panel)
        #
        if remove_pressure:
            del data['p']

    if not inplace:
        return data
Example #5
0
def standard_rel_humidity(data, rvar='r', tvar='t', dpdvar='dpd', qvar='q', update=False, replace=False,
                          method='murphy_koop', inplace=False, report=None, verbose=0):
    """ convert humidity variables to relative humidity
    1. r to vp (if present)
    2. q to vp, fills gaps
    3. dpd to vp, fills remaining gaps
    4. vp to rh

    Nothing is calculated when rvar is already present and update is False.

    set quality flags
      R   vp from r outside limits / resulting r outside 0 to 1
      Q   vp from q outside limits
      D   vp from dpd outside limits

    Parameters
    ----------
    data        DataFrame   Input database
    rvar        str         relative humidity variable (written)
    tvar        str         temperature variable
    dpdvar      str         dewpoint depression variable
    qvar        str         specific humidity variable
    update      bool        Update existing
    replace     bool        set flagged to nan
    method      str         Saturation water vapor, name of the function in esat_functions
    inplace     bool        apply directly to input / no copy ?
    report                  handed on to journal
    verbose     int         verboseness

    Returns
    -------
    data, nothing if inplace

    Raises
    ------
    ValueError      wrong input type, no humidity variable or no temperature
    """
    from raso.met.conversion import sh2vap
    from raso.met import esat_functions
    from raso.qc import profile_limits
    ############################################################################
    rt = profile_limits(tohpa=True, simple_names=True)  # RTTOV Variable Limits
    rt['p'] *= 100.         # hPa to Pa
    rt['vpmin'] *= 100.     # hPa to Pa
    rt['vpmax'] *= 100.     # hPa to Pa
    ############################################################################
    funcid = "[SV] "
    r_absmin = 0
    r_absmax = 1
    if not isinstance(data, (pd.DataFrame, pd.Panel)):
        raise ValueError(funcid + "Requires a Dataframe or Panel")

    if not inplace:
        data = data.copy()

    if not hasnames(data, 'qual'):  # same argument order as every other hasnames call
        data['qual'] = ''

    if hasnames(data, [rvar, qvar, dpdvar], value=0):
        raise ValueError(funcid + " Requires a humidity variable: %s, %s or %s" % (rvar, qvar, dpdvar))

    if hasnames(data, tvar, value=0):
        raise ValueError(funcid + " Requires variable: %s" % tvar)

    vpfunc = getattr(esat_functions, method)

    if not hasnames(data, rvar) or update:
        remove_pressure = False
        if isinstance(data, pd.Panel):
            data['p'] = 0.  # add minor_axis as p
            data.loc['p', :, :] = np.asarray(data.minor_axis)[np.newaxis, np.newaxis, :]
            remove_pressure = True

        journal(funcid + "rel. humidity (%s) update: %s" % (rvar, update), report, verbose)
        # Quality control: limits interpolated in log(p)
        vpmins = np.interp(np.log(data.p.values), np.log(rt.p.values), rt.vpmin.values, left=rt.vpmin.min(),
                           right=rt.vpmin.max())  # Interpolate Minimum
        vpmaxs = np.interp(np.log(data.p.values), np.log(rt.p.values), rt.vpmax.values, left=rt.vpmax.min(),
                           right=rt.vpmax.max())  # Interpolate Maximum
        vpsat = vpfunc(data[tvar].values)
        if hasnames(data, rvar):
            vp = data[rvar].values * vpsat  # Convert r,t to vp
            logic = ((vp < vpmins) | (vp > vpmaxs)) & np.isfinite(vp)
            data['qual'] = np.where(logic, data.qual.values + 'R', data.qual.replace('R', '').values)  # FLAG: R
            journal(funcid + "rel. humidity (%s) available (replace: %s)" % (rvar, replace), report, verbose)
            if replace:
                vp = np.where(logic, np.nan, vp)  # Apply? BAD, GOOD

        else:
            vp = np.full(data[tvar].shape, np.nan)  # nothing yet, to be filled from q / dpd

        if hasnames(data, qvar):
            qvp = sh2vap(data[qvar].values, data['p'].values)  # only a formula no approximation
            logic = ((qvp < vpmins) | (qvp > vpmaxs)) & np.isfinite(qvp)
            data['qual'] = np.where(logic, data.qual.values + 'Q', data.qual.replace('Q', '').values)  # FLAG: Q
            journal(funcid + "spec. humidity (%s) available (replace: %s)" % (qvar, replace), report, verbose)
            if replace:
                qvp = np.where(logic, np.nan, qvp)  # Apply? BAD, GOOD

            # Fill Gaps
            logic = (np.isfinite(qvp) & (~np.isfinite(vp)))  # GOOD, BAD
            vp = np.where(logic, qvp, vp)  # UPDATE, OLD

        if hasnames(data, dpdvar):
            dvp = vpfunc((data[tvar] - data[dpdvar]))  # Dewpoint -> vp
            logic = ((dvp < vpmins) | (dvp > vpmaxs)) & np.isfinite(dvp)
            data['qual'] = np.where(logic, data.qual.values + 'D', data.qual.replace('D', '').values)  # FLAG: D
            journal(funcid + "DPD (%s) available (replace: %s)" % (dpdvar, replace), report, verbose)
            if replace:
                dvp = np.where(logic, np.nan, dvp)  # Apply? BAD, GOOD

            # fill gaps
            logic = (np.isfinite(dvp) & (~np.isfinite(vp)))  # GOOD, BAD
            vp = np.where(logic, dvp, vp)  # UPDATE, OLD
        ############################################################################
        #
        # Convert VP to RH
        #
        data[rvar] = vp / vpsat  # rel. Humidity
        logic = ((data[rvar].values < r_absmin) | (data[rvar].values > r_absmax)) & np.isfinite(
            data[rvar].values)  # Range? BAD, GOOD
        data['qual'] = np.where(logic, data.qual.values + 'R', data.qual.replace('R', '').values)  # FLAG: R

        if replace:
            # use rvar here, the column is not necessarily called r
            data[rvar] = np.where(logic, np.nan, data[rvar].values)  # Apply? BAD, GOOD
        ############################################################################
        #
        # Unique Flags
        #
        data['qual'] = unique_flags(data['qual'])
        ############################################################################
        #
        # Pressure (only added above for a Panel)
        #
        if remove_pressure:
            del data['p']

    if not inplace:
        return data
def standard_dates_times(data, keep=False, inplace=False, night_noon=False, report=None, verbose=0, **kwargs):
    """Fix datetime index to standard sounding times (0, 6, 12, 18)

    Index times that are not exactly on a standard time are replaced by the
    result of _fix_datetime (or _fix_datetime_night_noon with night_noon).
    If several original times end up on the same new time, fix_dates decides
    for each group. Finally rows with the same (new date, p) are dropped.

    Parameters
    ----------
    data        pd.DataFrame        Radiosonde data, requiring a datetime index and column p
    keep        bool                attach old index? (column old_index)
    inplace     bool                apply directly to input / no copy ?
    night_noon  bool                Select only 00 and 12 UTC, adds column delta_t
    report                          handed on to journal
    verbose     int                 verboseness
    kwargs      dict                **

    Returns
    -------
    pd.DataFrame / same as input, nothing if inplace

    Notes
    -----
    The index name of the input is set to 'date', also without inplace.
    An empty input is returned as it is.
    """
    funcid = "[SDT] "
    data.index.name = 'date'
    itime = data.index.hour * 100 + data.index.minute  # HHMM as number: 0, 600, 1200, ...

    # Pre-select only relevant cases
    if night_noon:
        itx = (itime == 0) | (itime == 1200)
    else:
        itx = (itime == 0) | (itime == 600) | (itime == 1200) | (itime == 1800)

    if itx.size == 0:
        return data

    off_standard = ~itx
    rx = []
    if off_standard.any():
        fix = _fix_datetime_night_noon if night_noon else _fix_datetime
        # a real list: map() only gives an iterator on Python 3
        rx = [fix(stamp) for stamp in data.index[off_standard]]  # APPLY FUNCTION

    if not inplace:
        data = data.copy()

    data['newdate'] = data.index  # we have date and newdate !
    data['old_index'] = data['newdate'].copy()
    if rx:
        data.loc[off_standard, 'newdate'] = rx

    # check for duplicates
    idouble = data['newdate'] != data['old_index']  # what we changed
    itimes = data.loc[idouble, 'newdate'].unique()  # only these dates

    if len(itimes) > 0:
        # only dates are important now
        justdates = data.loc[data.newdate.isin(itimes), ['newdate']]
        justdates.index.name = 'old_index'
        justdates = justdates.reset_index()  # old_index, newdate

        # select all dates and group them. Check if the old index has multiple dates. If yes split again.
        counts = justdates.groupby('newdate').apply(lambda x: len(x['old_index'].unique()))
        duplicates = counts[counts > 1].index  # How many duplicates are there?
        journal(funcid + "Changed: %d, Duplicates: %d" % (len(itimes), len(duplicates)), report, verbose)

        if len(duplicates) > 0:
            # copy: columns are added to the selection below
            justdates = justdates[justdates.newdate.isin(duplicates)].copy()
            justdates['tdiff'] = justdates['newdate'] - justdates['old_index']
            justdates['tdiff'] /= np.timedelta64(1, 'h')  # normalize to hours

            justdates = justdates.groupby('newdate').apply(fix_dates)
            justdates = justdates.set_index('old_index')

            # NOTE(review): presumably matched by the old dates (index of both sides) - confirm
            data.loc[data.newdate.isin(duplicates), 'newdate'] = justdates['newdate']  # set back

    # general duplication check, because we can have real duplicates!!!
    n = data.shape[0]
    if inplace:
        data.drop_duplicates(subset=['newdate', 'p'], inplace=True)
    else:
        data = data.drop_duplicates(subset=['newdate', 'p'])
    n2 = data.shape[0]

    if n2 != n:
        journal(funcid + "General Duplicates removed: %d" % (n - n2), report, verbose)

    if night_noon:
        # calculate time difference between old and new index
        data['delta_t'] = data['newdate'] - data['old_index']

    data.set_index('newdate', inplace=True)
    data.index.name = 'date'  # fix name
    if not keep:
        del data['old_index']

    if not inplace:
        return data
Example #7
0
def merge_interpolate(data1, data2, variables=None, dropna=True, min_count=5, standard_time=True, d2name='era',
                      report=None, verbose=0):
    """
    Interpolate and Merge two datasets:
    1. convert to standard times (0,6,12,18)
    2. Merge Datasets
    3. Vertical logp interpolation

    Interpolation levels are the union of both datasets

    Parameters
    ----------
    data1             DataFrame     ['p', vars ... ]
    data2             DataFrame     ['p', vars ... ]
    variables         list / str    variables to use, p is added if missing
    dropna            bool          remove missing values before interpolation
    min_count         int           minimum values per profile
    standard_time     bool          convert data1 to standard times
    d2name            str           name of the second dataset, used as column suffix
    report                          handed on to journal
    verbose           int           verboseness

    Returns
    -------
    newdata           DataFrame     ['p', vars ... ] with column source (0 raw, 1 both, 2 second dataset)

    Raises
    ------
    ValueError     not a pandas DataFrame
    RuntimeError   missing p column
    """
    from interpolation import interp_profile

    funcid = "[M] "
    if not isinstance(data1, pd.DataFrame) or not isinstance(data2, pd.DataFrame):
        raise ValueError(funcid + "Requires a pandas DataFrame as input")

    if 'p' not in data1.columns or 'p' not in data2.columns:
        raise RuntimeError(funcid + "Missing pressure column: p")

    # check variables ...
    if variables is not None:
        if isinstance(variables, str):
            # one name: list(variables) would split it into single characters
            variables = [variables]
        else:
            variables = list(variables)  # private copy, the caller's list stays as it is

        if 'p' not in variables:
            variables.append('p')

        data1 = data1[variables].copy()
        data2 = data2[variables].copy()
        journal(funcid + "Subsetting ... %s" % ",".join(variables), report, verbose)

    data1.index.name = 'date'
    data2.index.name = 'date'

    # match indices & merge (outer -> add both sides
    if standard_time:
        data1 = standard_dates_times(data1)  # 0,6,12,18
    else:
        data1 = data1.copy()

    # rename variables of the second dataset (p stays p) and make sure we have floats!!
    orig_d2 = 'orig_%s' % d2name
    data2 = data2.astype('float64').rename(columns=lambda x: x + "_%s" % d2name).rename(
        columns={'p_%s' % d2name: 'p'}).reset_index()

    # marker columns, missing after the outer merge where a side has no row
    data2[orig_d2] = 1
    data1['orig_raso'] = 1

    journal(funcid + " Merging %s with %s on date and p" % (str(data1.shape), str(data2.shape)), report, verbose)
    # Merge
    alldata = pd.merge(data1.reset_index(),
                       data2,
                       left_on=['date', 'p'],
                       right_on=['date', 'p'],
                       how='outer').set_index('date', drop=True)

    # 0 raw, 1 both, 2 second dataset
    alldata['source'] = np.where(np.isfinite(alldata[orig_d2]), 2, 0)
    alldata['source'] = np.where((np.isfinite(alldata['orig_raso'])) & (alldata['source'] == 2), alldata['source'] - 1,
                                 alldata['source'])  # both sides present -> 1

    # marker name follows d2name (was fixed to 'orig_era')
    alldata.drop([orig_d2, 'orig_raso'], axis=1, inplace=True)

    alldata['source'] = alldata['source'].astype(str)
    if 'orig' in alldata.columns:
        alldata['orig'] = np.int_(alldata['orig'].values)
        alldata['orig'] = alldata['orig'].astype(str)

    # every profile is interpolated to its own p-levels
    mod_interp = lambda x: interp_profile(x, pout=x['p'].values, pcolumn='p', dropna=dropna, min_values=min_count)
    journal(funcid + "Variables: %s" % ",".join(alldata.columns), report, verbose)
    journal(funcid + " Interpolating %s (NA: %s, Min: %d)" % (str(alldata.shape), dropna, min_count), report, verbose)

    newdata = alldata.groupby(alldata.index).apply(mod_interp)
    newdata = newdata.reset_index().drop('level_1', axis=1)

    # Multi-index to long / sort
    newdata = newdata.sort_values(by=['date', 'p']).set_index('date', drop=True)

    # NOTE(review): these conversions change alldata, which is not returned;
    # presumably newdata was meant - confirm before changing the returned dtypes
    alldata['source'] = alldata['source'].astype(int)

    if 'orig' in alldata.columns:
        alldata['orig'] = alldata['orig'].astype(int)

    journal(funcid + " finished: %s > %s" % (str(data1.shape), str(newdata.shape)), report, verbose)

    return newdata
Example #8
0
def interp_dataframe(data, levels=None, variables=None, min_count=5, dropna=True, report=None, verbose=0):
    """
    Interpolate:
    1. Select only profiles (dates) with enough (min_count) values
    2. Interpolate each profile (date) vertically to levels

    Interpolation is only done at dates with enough data

    Args:
      data       pandas DataFrame with p

    Keyword Args:
      levels     [std_plevels]       pressure levels to interpolate to
      variables  [all columns]       Variables for quality check (min_count), p is always added
      min_count  [5]                 A variable counts if a profile has more values than this
      dropna     [True]              Remove missing values before interpolation
      report     [None]              handed on to journal
      verbose    [0]                 Show more info

    Returns:
      newdata    pandas DataFrame with newly interpolated values, sorted by date and p

    Raises:
      ValueError when p or other variables not inside data
      RuntimeError when no profile passes the minimum count check

    Calls:
      interp_profile

    Note:
      an unnamed index of data gets the name 'date' (set on the input)
    """
    from raso.config import std_plevels
    funcid = "[INTP] "

    if levels is None:
        levels = std_plevels

    if variables is None:
        variables = data.columns.tolist()
    elif isinstance(variables, str):
        variables = [variables]  # single name

    # fix index name
    if data.index.name is None or data.index.name == '':
        data.index.name = 'date'

    index_name = data.index.name

    # list(): also accept tuples or an Index
    variables = list(set(list(variables) + ['p']))  # add p
    variables = data.columns[data.columns.isin(variables)].tolist()  # inside ?

    if len(variables) < 1 or 'p' not in variables:
        raise ValueError(funcid + "Dataframe requires at least 2 columns(p,+) %s" % (",".join(variables)))

    journal(funcid + "Quality check (%s) Min: %d NAN: %s" % (",".join(variables), min_count, color_boolean(dropna)),
            report, verbose)

    # sometimes one variable has no values at all
    counts = data.count()[variables]
    if (counts == 0).any():
        variables = counts[counts > 0].index.tolist()
        if len(variables) < 1 or 'p' not in variables:
            print(counts)  # call form works as statement (py2) and function (py3)
            raise ValueError(funcid + "Dataframe requires at least 2 columns(p,+) %s" % (",".join(variables)))

    # per date: number of variables with more than min_count values
    # NOTE(review): "> 2" needs three variables, the original comment said "at least 2" - confirm
    itx = (data.groupby(data.index).count()[variables] > min_count).sum(axis=1) > 2

    if itx.sum() == 0:
        raise RuntimeError(funcid + "No data left with %d as minimum data count per profile" % min_count)

    # boolean selection per date with .loc (.ix is not available in recent pandas)
    data = data.loc[itx, :]

    journal(funcid + "from %d to %d" % (len(itx), itx.sum()), report, verbose)
    #
    data = data.groupby(data.index).apply(interp_profile, variables=variables, pout=levels, oan=True, dropna=dropna,
                                          min_values=min_count)

    data = data.reset_index().drop('level_1', axis=1).sort_values(by=[index_name, 'p']).set_index(index_name, drop=True)
    journal(funcid + "Done: %s" % str(data.shape), report, verbose)
    return data