Example #1
import math
import random

import mad  # module under test, providing mad.mad


def test_4():
    m = random.choice(range(1,10))
    e = random.choice(range(4))
    n = m*10**e
    
    n |= 1 # ensure odd
    l = list(range(n))
    random.shuffle(l)

    # for a shuffled range(n) with n odd, the median (n-1)/2 is also the
    # maximum absolute deviation from the median
    median = (n - 1) / 2
    assert math.ceil(median/2.0) == mad.mad(l)
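The assertion relies on an identity: for a permutation of range(n) with odd n, the sorted absolute deviations from the median are 0, 1, 1, 2, 2, ..., so their median is ceil(median/2). A minimal standalone check of that identity, using statistics.median in place of the project's mad.mad:

import math
import statistics

n = 9                                  # odd, as in the test
l = list(range(n))
med = statistics.median(l)             # (n - 1) / 2 = 4
dev = sorted(abs(x - med) for x in l)  # [0, 1, 1, 2, 2, 3, 3, 4, 4]
assert statistics.median(dev) == math.ceil(med / 2.0)  # MAD = 2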
Example #2
import json

import load  # project-local helpers: I/O, MAD scoring, recommender, metrics
import mad
import recEngineAlgo
import similarity


def test():
    result = {}
    func = {
        'sim_euc': similarity.sim_euc,
        'sim_man': similarity.sim_man,
        'sim_Lmax': similarity.sim_Lmax
    }
    for fun in func:  # for all three metrics
        result.setdefault(fun, {})
        for k in [10, 50, 100]:  # check three hand-picked k values
            madlist = []
            for i in [1, 2, 3, 4, 5]:  # 5-fold cross validation
                print("Running", i, "validation for function", fun,
                      "k value is", k)
                recEngineAlgo.getRecommendation(
                    i,
                    load.readPrefs('./Datastore/prefsTrain' + str(i) +
                                   '.json'),
                    load.readPrefs('./Datastore/movies.json'), k, func[fun])
                madScore = mad.mad(
                    load.readPrefs('./Datastore/rec_' + str(fun) + '_' +
                                   str(k) + '_' + str(i) + '.json'),
                    load.readPrefs('./Datastore/prefsTest' + str(i) + '.json'))
                print("Madscore for", i, "validation for function", fun,
                      "with k value", k, "is", madScore)
                madlist.append(madScore)

            avgMad = round(sum(madlist) / 5, 2)  # average MAD over the 5 folds
            print("Final MadScore for func:", str(fun), "and k:", k, "is:",
                  avgMad)
            result[fun][k] = avgMad
            print(result)

    with open('./Datastore/result.json', 'w') as fp:  # store the result as JSON
        json.dump(result, fp)
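A minimal sketch of reading the summary back, assuming the same path as above; note that the JSON round trip turns the integer k keys into strings:

import json

with open('./Datastore/result.json') as fp:
    result = json.load(fp)  # {metric: {"10": avg, "50": avg, "100": avg}}
for fun, scores in result.items():
    for k, avg in scores.items():  # k is a string after json.load
        print(fun, k, avg)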
Example #3
import warnings

import numpy as np
import pandas as pd
import scipy.optimize as opt

# mad, lloyd_fix, cost_lloyd_fix, cost_abs and lloyd_only_rref_p are
# project-local helpers assumed to be importable from the surrounding package.


def _nee2gpp_reichstein(df, ff, isday, undef=-9999, nogppnight=False):
    """
    Calculate photosynthesis (GPP) and ecosystem respiration (RECO) from original
    Eddy flux data, using several fits of Reco vs. temperature of nighttime data
    over the season, as in Reichstein et al. (2005), in order to calculate Reco
    and then GPP = Reco - NEE.

    Parameters
    ----------
    df : pandas.DataFrame
        time series of CO2 fluxes and air temperature.

        pandas.DataFrame with the columns
        'FC' or 'NEE' (or starting with 'FC_' or 'NEE_') for observed CO2 flux [umol(CO2) m-2 s-1]
        'TA' (or starting with 'TA_') for air temperature [K].
        The index is taken as date variable.
    ff : pandas.DataFrame
        flag DataFrame or array of the same shape as `df`. Non-zero values in
        `ff` are treated as missing values in `df`.

        `ff` must follow the same rules as `df`.
    isday : array_like of bool
        True when it is day, False when night. Must have the same length as `df.shape[0]`.
    undef : float, optional
        values having `undef` value are treated as missing values in `df` (default: -9999)
    nogppnight : bool, optional
        if True, GPP is set to zero at night and RECO then equals NEE at night (default: False)

    Returns
    -------
    pandas.DataFrame
        pandas.DataFrame with two columns 'GPP' and 'RECO' with estimated
        photosynthesis and ecosystem respiration.

    Literature
    ----------
    Reichstein et al. (2005)
        On the separation of net ecosystem exchange into assimilation and ecosystem
        respiration: review and improved algorithm,
        Global Change Biology 11, 1424-1439

    Examples
    --------
    >>> import numpy as np
    >>> from fread import fread
    >>> from date2dec import date2dec
    >>> from dec2date import dec2date
    >>> ifile = 'test_nee2gpp.csv'
    >>> undef = -9999.
    >>> dat   = fread(ifile, skip=2, transpose=True)
    >>> ndat  = dat.shape[1]
    >>> head  = fread(ifile, skip=2, header=True)
    >>> head1 = head[0]
    >>> # date
    >>> jdate = date2dec(dy=dat[0,:], mo=dat[1,:], yr=dat[2,:], hr=dat[3,:], mi=dat[4,:])
    >>> adate = dec2date(jdate, eng=True)
    >>> # colhead
    >>> idx   = []
    >>> for i in head1:
    ...     if i in ['NEE', 'rg', 'Tair', 'VPD']: idx.append(head1.index(i))
    >>> colhead = ['FC', 'SW_IN', 'TA', 'VPD']
    >>> # data
    >>> dfin = dat[idx,:]
    >>> dfin[2,:] = np.where(dfin[2,:] == undef, undef, dfin[2,:]+273.15)
    >>> dfin[3,:] = np.where(dfin[3,:] == undef, undef, dfin[3,:]*100.)
    >>> # flag
    >>> flag = np.where(dfin == undef, 2, 0)
    >>> # partition
    >>> GPP, Reco = nee2gpp(dfin, flag=flag, date=adate, colhead=colhead, undef=undef, method='local')
    >>> print(GPP[1120:1128])
    [-9.99900000e+03 -9.99900000e+03 -9.99900000e+03  4.40606871e+00
      8.31942152e+00  1.06242542e+01  8.49245664e+00  1.12381973e+01]
    >>> print(Reco[1120:1128])
    [1.68311981 1.81012431 1.9874173  2.17108871 2.38759152 2.64372415
     2.90076664 3.18592735]

    >>> GPP, Reco = nee2gpp(dfin, flag=flag, date=adate, colhead=colhead, undef=undef, method='Reichstein')
    >>> print(GPP[1120:1128])
    [-9.99900000e+03 -9.99900000e+03 -9.99900000e+03  4.40606871e+00
      8.31942152e+00  1.06242542e+01  8.49245664e+00  1.12381973e+01]

    >>> GPP, Reco = nee2gpp(dfin, flag=flag, date=adate, colhead=colhead, undef=undef, method='reichstein')
    >>> print(GPP[1120:1128])
    [-9.99900000e+03 -9.99900000e+03 -9.99900000e+03  4.40606871e+00
      8.31942152e+00  1.06242542e+01  8.49245664e+00  1.12381973e+01]

    History
    -------
    Written  Matthias Cuntz, Mar 2012
    Modified Arndt Piayda,   Mar 2012 - undef=np.nan
             Matthias Cuntz, Nov 2012 - individual routine
             Matthias Cuntz, Feb 2013 - ported to Python 3
    """
    # Variables
    fc_id = ''
    for cc in df.columns:
        if cc.startswith('FC_') or (cc == 'FC') or cc.startswith('NEE_') or (
                cc == 'NEE'):
            fc_id = cc
            break
    ta_id = ''
    for cc in df.columns:
        if cc.startswith('TA_') or (cc == 'TA'):
            ta_id = cc
            break
    assert fc_id, 'Carbon net flux with name FC or NEE or starting with FC_ or NEE_ must be in input.'
    assert ta_id, 'Air temperature with name TA or starting with TA_ must be in input.'

    nee = np.ma.array(df[fc_id], mask=(ff[fc_id] > 0))
    t = np.ma.array(df[ta_id], mask=(ff[ta_id] > 0))
    misday = np.ma.array(isday,
                         mask=((~np.isfinite(isday)) | (isday == undef)))
    dates = df.index.to_julian_date()

    # Partition - Local relationship = Reichstein et al. (2005)

    ndata = nee.size
    GPP = np.ones(ndata) * undef
    Reco = np.ones(ndata) * undef
    dfout = pd.DataFrame({'GPP': GPP, 'RECO': Reco}, index=df.index)

    # Select valid nighttime
    mask = misday | nee.mask | t.mask | misday.mask
    ii = np.where(~mask)[0]
    if (ii.size == 0):
        # raise ValueError('Error _nee2gpp_reichstein: no valid nighttime data.')
        print('Warning _nee2gpp_reichstein: no valid nighttime data.')
        return dfout
    jul = dates[ii]
    tt = np.ma.compressed(t[ii])
    net = np.ma.compressed(nee[ii])
    # 1. each 5 days, in 15 day period, fit if range of T > 5
    locp = []  # local param
    locs = []  # local err
    # Julian days start at noon, i.e. 1.0 is 12h, so the search runs from
    # noon to noon and thus includes whole nights.
    dmin = int(np.floor(np.amin(jul)))
    dmax = int(np.ceil(np.amax(jul)))
    for i in range(dmin, dmax, 5):
        iii = np.where((jul >= i) & (jul < (i + 14)))[0]
        niii = iii.size
        if niii > 6:
            tt1 = tt[iii]
            net1 = net[iii]
            mm = ~mad(net1, z=4.5)  # make fit more robust by removing outliers
            if (np.ptp(tt[iii]) >= 5.) & (np.sum(mm) > 6):
                p, temp1, temp2 = opt.fmin_tnc(cost_lloyd_fix, [2., 200.],
                                               bounds=[[0., None], [0., None]],
                                               args=(tt1[mm], net1[mm]),
                                               approx_grad=True,
                                               disp=False)
                try:
                    p1, c = opt.curve_fit(lloyd_fix,
                                          tt1[mm],
                                          net1[mm],
                                          p0=p,
                                          maxfev=10000)  # params, covariance
                    if np.all(np.isfinite(c)):  # curve_fit can return c=inf
                        s = np.sqrt(np.diag(c))
                    else:
                        s = 10. * np.abs(p)
                except Exception:
                    s = 10. * np.abs(p)
                locp += [p]
                locs += [s]
    if len(locp) == 0:
        # raise ValueError('Error _nee2gpp_reichstein: No local relationship found.')
        print('Warning _nee2gpp_reichstein: No local relationship found.')
        return dfout
    locp = np.squeeze(np.array(locp).astype(float))
    locs = np.squeeze(np.array(locs).astype(float))
    # 2. E0 = avg of best 3
    # Reichstein et al. (2005), p. 1430, 1st paragraph.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        iii = np.where((locp[:, 1] > 0.) & (locp[:, 1] < 450.)
                       & (np.abs(locs[:, 1] / locp[:, 1]) < 0.5))[0]
    niii = iii.size
    if niii == 0:
        # raise ValueError('Error _nee2gpp_reichstein: No good local relationship found.')
        # loosen the criteria: take the best three estimates anyway
        iii = np.where((locp[:, 1] > 0.))[0]
        niii = iii.size
        if niii < 1:
            # raise ValueError('Error _nee2gpp_reichstein: No E0>0 found.')
            print('Warning _nee2gpp_reichstein: No E0>0 found.')
            return dfout
        lp = locp[iii, :]
        ls = locs[iii, :]
        iis = np.argsort(ls[:, 1])
        bestp = np.mean(lp[iis[0:np.minimum(3, niii)], :], axis=0)
        bests = np.mean(ls[iis[0:np.minimum(3, niii)], :], axis=0)
    elif niii == 1:
        bestp = np.squeeze(locp[iii, :])
        bests = np.squeeze(locs[iii, :])
    elif niii == 2:
        bestp = np.mean(locp[iii, :], axis=0)
        bests = np.mean(locs[iii, :], axis=0)
        # ls    = locs[iii,:]
        # iis   = np.argsort(ls[:,1])
    else:
        lp = locp[iii, :]
        ls = locs[iii, :]
        iis = np.argsort(ls[:, 1])
        bestp = np.mean(lp[iis[0:3], :], axis=0)
        bests = np.mean(ls[iis[0:3], :], axis=0)

    # 3. Refit Rref with fixed E0, each 4 days
    refp = []  # Rref param
    refii = []  # mean index of data points
    E0 = bestp[1]
    et = lloyd_fix(tt, 1., E0)
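    # lloyd_fix(T, Rref, E0) is presumably the Lloyd & Taylor (1994) curve
    # R = Rref*exp(E0*(1/(Tref-T0) - 1/(T-T0))); with Rref=1 it gives the
    # temperature response alone, so only Rref is fitted in each window below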
    for i in range(dmin, dmax, 4):
        iii = np.where((jul >= i) & (jul < (i + 4)))[0]
        niii = iii.size
        if niii > 3:
            # Calc directly minisation of (nee-p*et)**2
            p, temp1, temp2 = opt.fmin_tnc(cost_abs, [2.],
                                           bounds=[[0., None]],
                                           args=(lloyd_only_rref_p, et[iii],
                                                 net[iii]),
                                           approx_grad=True,
                                           disp=False)
            refp += [p]
            refii += [int((iii[0] + iii[-1]) // 2)]
    if len(refp) == 0:
        # raise ValueError('Error _nee2gpp_reichstein: No ref relationship found.')
        print('Warning _nee2gpp_reichstein: No ref relationship found.')
        return dfout
    refp = np.squeeze(np.array(refp))
    refii = np.squeeze(np.array(refii))

    # 4. Interpol Rref
    Rref = np.interp(dates, jul[refii], refp)

    # 5. Calc Reco
    Reco = np.ones(ndata) * undef
    ii = np.where(~t.mask)[0]
    Reco[ii] = lloyd_fix(t[ii], Rref[ii], E0)

    # 6. Calc GPP
    GPP = np.ones(ndata) * undef
    ii = np.where(~(t.mask | nee.mask))[0]
    GPP[ii] = Reco[ii] - nee[ii]

    # 7. Set GPP=0 at night, if wanted
    if nogppnight:
        mask = misday | nee.mask | t.mask | misday.mask  # night
        ii = np.where(~mask)[0]
        Reco[ii] = nee[ii]
        GPP[ii] = 0.
        # and prohibit negative gpp at any time
        mask = nee.mask | t.mask | (GPP > 0.)
        ii = np.where(~mask)[0]
        Reco[ii] -= GPP[ii]
        GPP[ii] = 0.

    dfout = pd.DataFrame({'GPP': GPP, 'RECO': Reco}, index=df.index)

    return dfout
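A minimal synthetic call sketch, assuming the function and its fit helpers are importable from the surrounding module; real data would normally be read from file as in the doctest, and whether the local fits succeed depends on the made-up input:

import numpy as np
import pandas as pd

idx = pd.date_range('2020-06-01', periods=48 * 30, freq='30min')
cycle = np.sin(np.arange(idx.size) * 2. * np.pi / 48.)  # fake diurnal cycle
ta = 283.15 + 8. * cycle                                # air temperature [K]
resp = 2. * np.exp(0.05 * (ta - 283.15))                # crude T-dependent Reco
df = pd.DataFrame({'FC': np.where(cycle > 0., -10., resp), 'TA': ta},
                  index=idx)
ff = pd.DataFrame(0, index=idx, columns=df.columns)     # flag: all data valid
isday = cycle > 0.

out = _nee2gpp_reichstein(df, ff, isday)
print(out[['GPP', 'RECO']].describe())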
Example #4
import mad  # module under test, providing mad.mad


def test_3():
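    # median of [1, 2, 3] is 2; absolute deviations are [1, 0, 1], median 1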
    assert 1 == mad.mad([1,2,3])
Example #5
import numpy as np
import pandas as pd

# mad is the project-local median absolute deviation filter used below.


def madspikes(dfin,
              flag=None,
              isday=None,
              colhead=None,
              undef=-9999,
              nscan=15 * 48,
              nfill=1 * 48,
              z=7,
              deriv=2,
              swthr=10.,
              plot=False):
    """
    Spike detection for using a moving median absolute difference filter.
    Used with Eddy vovariance data in Papale et al. (Biogeosciences, 2006).

    Parameters
    ----------
    dfin : pandas.DataFrame or numpy.ndarray
        time series of data where spike detection with MAD should be applied.

        `dfin` can be a pandas.DataFrame.

        `dfin` can also be a numpy array. In this case `colhead` must be given.
        MAD will be applied along axis=0, i.e. on each column of axis=1.
    flag : pandas.DataFrame or numpy.ndarray, optional
        flag DataFrame or array of the same shape as `dfin`. Non-zero values in
        `flag` will be treated as missing values in `dfin`.

        If `flag` is a numpy array, `df.columns.values` will be used as column heads.
    isday : array_like of bool, optional
        True when it is day, False when night. Must have the same length as dfin.shape[0].

        If `isday` is not given, `dfin` must have a column with head 'SW_IN' or
        starting with 'SW_IN'. `isday` will then be `dfin['SW_IN'] > swthr`.
    colhead : array_like of str, optional
        column names if `dfin` is numpy array.
    undef : float, optional
        values having `undef` value are treated as missing values in `dfin` (default: -9999)

        np.nan is not allowed as `undef`.
    nscan : int, optional
        size of moving window to calculate MAD in time steps (default: 15*48)
    nfill : int, optional
        step size of moving window to calculate MAD in time steps (default: 1*48)

        MAD will be calculated in the `nscan` time window. The resulting mask will
        be applied only in the `nfill` window in the middle of the `nscan` window.
        Then the `nscan` window will be moved by `nfill` time steps. With the
        defaults on half-hourly data, MAD is thus computed over 15 days and flags
        are set for the central day before the window advances by one day.
    z : float, optional
        Input is allowed to deviate a maximum of `z` standard deviations from the median (default: 7)
    deriv : int, optional
        0: Act on raw input.

        1: Use first derivatives.

        2: Use 2nd derivatives (default).
    swthr : float, optional
        Threshold to determine daytime from incoming shortwave radiation if `isday` not given (default: 10).
    plot : bool, optional
        True: data and spikes are plotted into madspikes.pdf (default: False).

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        flags, 0 everywhere except detected spikes set to 2.

    History
    -------
    Written,    Matthias Cuntz & Tino Rau, 2008
    Maintained, Arndt Piayda,   Aug 2014
    Modified,   Matthias Cuntz, Apr 2020 - input can be pandas Dataframe or numpy array(s)
                                         - removed iteration
                Matthias Cuntz, May 2020 - numpy docstring format
    """
    # numpy or panda
    if isinstance(dfin, (np.ndarray, np.ma.MaskedArray)):
        isnumpy = True
        istrans = False
        assert colhead is not None, 'colhead must be given if input is numpy.ndarray.'
        if dfin.shape[0] == len(colhead):
            istrans = True
            df = pd.DataFrame(dfin.T, columns=colhead)
        elif dfin.shape[1] == len(colhead):
            df = pd.DataFrame(dfin, columns=colhead)
        else:
            raise ValueError(
                'Length of colhead must be number of columns in input array. len(colhead)='
                + str(len(colhead)) + ' shape(input)=(' + str(dfin.shape[0]) +
                ',' + str(dfin.shape[1]) + ').')
    else:
        isnumpy = False
        istrans = False
        assert isinstance(
            dfin, pd.core.frame.DataFrame
        ), 'Input must be either numpy.ndarray or pandas.DataFrame.'
        df = dfin.copy(deep=True)

    # Incoming flags
    if flag is not None:
        if isinstance(flag, (np.ndarray, np.ma.MaskedArray)):
            fisnumpy = True
            fistrans = False
            if flag.shape[0] == len(df):
                ff = pd.DataFrame(flag, columns=df.columns.values)
            elif flag.shape[1] == len(df):
                fistrans = True
                ff = pd.DataFrame(flag.T, columns=df.columns.values)
            else:
                raise ValueError(
                    'flag must have same shape as data array. data: ({:d},{:d}); flag: ({:d},{:d})'
                    .format(dfin.shape[0], dfin.shape[1], flag.shape[0],
                            flag.shape[1]))
            ff = ff.set_index(df.index)
        else:
            fisnumpy = False
            fistrans = False
            assert isinstance(
                flag, pd.core.frame.DataFrame
            ), 'Flag must be either numpy.ndarray or pandas.DataFrame.'
            ff = flag.copy(deep=True)
    else:
        fisnumpy = isnumpy
        fistrans = istrans
        # flags: 0: good; 1: input flagged; 2: output flagged
        ff = pd.DataFrame(0, dtype=int, index=df.index, columns=df.columns)
        ff[df == undef] = 1
        ff[df.isna()] = 1

    # day or night
    if isday is None:
        sw_id = ''
        for cc in df.columns:
            if cc.startswith('SW_IN'):
                sw_id = cc
                break
        assert sw_id, 'Global radiation with name SW_IN or starting with SW_IN must be in input if isday not given.'
        # Papale et al. (Biogeosciences, 2006): 20; REddyProc: 10
        isday = df[sw_id] > swthr
    if isinstance(isday, (pd.core.series.Series, pd.core.frame.DataFrame)):
        isday = isday.to_numpy()
    # go through float so that undef can be flagged even if isday came as bool
    isday = np.asarray(isday, dtype=float)
    isday[isday == undef] = np.nan
    ff[np.isnan(isday)] = 1
    isday = np.nan_to_num(isday).astype(bool)

    # parameters
    nrow, ncol = df.shape
    half_scan_win = nscan // 2
    half_fill_win = nfill // 2

    # calculate dusk and dawn times and separate in day and night
    isdawn = np.zeros(nrow, dtype=bool)
    isdusk = np.zeros(nrow, dtype=bool)
    dis = isday.astype(int) - np.roll(isday, -1).astype(int)
    isdawn[:-1] = dis[:-1] == -1
    isdusk[:-1] = dis[:-1] == 1
    isddday = isdawn.copy()
    isddday[1:] |= isdusk[:-1]    # start and end of day
    isddnight = isdusk.copy()
    isddnight[1:] |= isdawn[:-1]  # start and end of night

    # iterate over each column of data
    if plot:
        import matplotlib.pyplot as plt
        import matplotlib.backends.backend_pdf as pdf
        pd.plotting.register_matplotlib_converters()
        pp = pdf.PdfPages('madspikes.pdf')

    cols = list(df.columns)
    for hcol in df.columns:

        if hcol.startswith('SW_IN'):
            continue

        data = df[hcol]
        dflag = ff[hcol]

        # get day and night data
        data_day = data.copy(deep=True)
        data_day[~(isday | isddday) | (dflag != 0) | (data == undef)] = np.nan
        data_night = data.copy(deep=True)
        data_night[~(~isday | isddnight) | (dflag != 0) |
                   (data == undef)] = np.nan

        # iterate over fill window
        for j in range(half_fill_win, nrow - 1, 2 * half_fill_win):
            j1 = max(j - half_scan_win - 1, 0)
            j2 = min(j + half_scan_win + 1, nrow)
            fill_start = max(j - half_fill_win, 1)
            fill_end = min(j + half_fill_win, nrow - 1)

            dd = data_day[j1:j2].to_numpy()
            day_flag = mad(np.ma.masked_array(data=dd, mask=np.isnan(dd)),
                           z=z,
                           deriv=deriv)
            ff.iloc[fill_start:fill_end, cols.index(hcol)] += np.where(
                day_flag[fill_start - j1 - 1:fill_end - j1 - 1], 2, 0)

            nn = data_night[j1:j2].to_numpy()
            night_flag = mad(np.ma.masked_array(data=nn, mask=np.isnan(nn)),
                             z=z,
                             deriv=deriv)
            ff.iloc[fill_start:fill_end, cols.index(hcol)] += np.where(
                night_flag[fill_start - j1 - 1:fill_end - j1 - 1], 2, 0)

        if plot:
            fig = plt.figure(1)
            sub = fig.add_subplot(111)
            valid = ff[hcol] == 0
            sub.plot(data[valid], 'ob')
            sub.plot(data[ff[hcol] == 2], 'or')
            plt.title(hcol)
            pp.savefig(fig)
            plt.close(fig)

    # Finish

    if plot:
        pp.close()

    if fisnumpy:
        if fistrans:
            return ff.to_numpy().T
        else:
            return ff.to_numpy()
    else:
        return ff
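A hypothetical usage sketch: flag spikes in one month of synthetic half-hourly data. The column names 'SW_IN' and 'FC' match what madspikes looks for; everything else is made up for illustration:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
idx = pd.date_range('2020-06-01', periods=48 * 30, freq='30min')
cycle = np.sin(np.arange(idx.size) * 2. * np.pi / 48.)
df = pd.DataFrame({'SW_IN': np.where(cycle > 0., 500. * cycle, 0.),
                   'FC': rng.normal(0., 1., idx.size)},
                  index=idx)
df.loc[idx[100], 'FC'] = 50.   # inject one obvious spike
flags = madspikes(df)          # returns a flag DataFrame; 2 marks spikes
print((flags['FC'] == 2).sum())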