def test_4():
    """Check mad.mad on a shuffled range of random odd length.

    The expected value is ceil(median / 2), with median = (length - 1) / 2.
    """
    mantissa = random.choice(range(1, 10))
    exponent = random.choice(range(4))
    # Setting the lowest bit forces an odd number of elements.
    size = (mantissa * 10**exponent) | 1
    values = list(range(size))
    random.shuffle(values)
    # For 0..size-1 the median is also the largest deviation from the median.
    median = (size - 1) / 2
    expected = math.ceil(median / 2.0)
    assert expected == mad.mad(values)
def test():
    """Run a 5-fold cross validation for every metric and every k.

    For each similarity function and each hand-picked k, the recommender is
    run once per fold and scored with ``mad.mad`` against that fold's test
    preferences. The mean score over the folds, rounded to two decimals, is
    printed, collected in a nested dict ``{metric: {k: score}}`` and finally
    dumped to ``./Datastore/result.json``.
    """
    folds = [1, 2, 3, 4, 5]  # 5 fold cross validation
    k_values = [10, 50, 100]  # three different k values (handpicked)
    func = {
        'sim_euc': similarity.sim_euc,
        'sim_man': similarity.sim_man,
        'sim_Lmax': similarity.sim_Lmax
    }
    result = {}
    for fun in func:  # for all three metrics
        result.setdefault(fun, {})
        for k in k_values:
            # One list per (metric, k): it must outlive the fold loop,
            # otherwise only the last fold would enter the average.
            madlist = []
            for i in folds:
                print("Running", i, "Validation for function", fun,
                      "k value is", k)
                # presumably writes ./Datastore/rec_<fun>_<k>_<i>.json, which
                # is read back below -- confirm against recEngineAlgo
                recEngineAlgo.getRecommendation(
                    i,
                    load.readPrefs('./Datastore/prefsTrain' + str(i) + '.json'),
                    load.readPrefs('./Datastore/movies.json'),
                    k,
                    func[fun])
                madScore = mad.mad(
                    load.readPrefs('./Datastore/rec_' + str(fun) + '_' +
                                   str(k) + '_' + str(i) + '.json'),
                    load.readPrefs('./Datastore/prefsTest' + str(i) + '.json'))
                print("Madscore for", i, "validation for function", fun,
                      "with k value", k, "is", madScore)
                madlist.append(madScore)
            # Mean over all folds; summing from zero avoids counting the
            # last fold twice.
            average = round(sum(madlist) / len(madlist), 2)
            print("Final MadScore for func:", str(fun), "and k:", k, "is:",
                  average)
            result[fun][k] = average
    print(result)
    with open('./Datastore/result.json', 'w') as fp:  # store the result dict
        json.dump(result, fp)
def _nee2gpp_reichstein(df, ff, isday, undef=-9999, nogppnight=False): """ Calculate photosynthesis (GPP) and ecosystem respiration (RECO) from original Eddy flux data, using several fits of Reco vs. temperature of nighttime data over the season, as in Reichstein et al. (2005), in order to calculate Reco and then GPP = Reco - NEE. Parameters ---------- df : pandas.Dataframe time series of CO2 fluxes and air temperature. pandas.Dataframe with the columns 'FC' or 'NEE' (or starting with 'FC\_' or 'NEE\_') for observed CO2 flux [umol(CO2) m-2 s-1] 'TA' (or starting with 'TA\_') for air temperature [K] The index is taken as date variable. ff : pandas.Dataframe flag Dataframe or array has the same shape as `df`. Non-zero values in `ff` will be treated as missing values in `df`. `ff` must follow the same rules as `df`. isday : array_like of bool True when it is day, False when night. Must have the same length as `df.shape[0].` undef : float, optional values having `undef` value are treated as missing values in `df` (default: -9999) nogppnight : float, optional GPP will be set to zero at night. RECO will then equal NEE at night (default: False) Returns ------- pandas.Dataframe pandas.Dataframe with two columns 'GPP' and 'RECO' with estimated photosynthesis and ecosystem respiration. Literature ---------- Reichstein et al. (2005) On the separation of net ecosystem exchange into assimilation and ecosystem respiration: review and improved algorithm, Global Change Biology 11, 1424-1439 Examples -------- >>> from fread import fread >>> from date2dec import date2dec >>> from dec2date import dec2date >>> ifile = 'test_nee2gpp.csv' >>> undef = -9999. >>> dat = fread(ifile, skip=2, transpose=True) >>> ndat = dat.shape[1] >>> head = fread(ifile, skip=2, header=True) >>> head1 = head[0] >>> # date >>> jdate = date2dec(dy=dat[0,:], mo=dat[1,:], yr=dat[2,:], hr=dat[3,:], mi=dat[4,:]) >>> adate = dec2date(jdate, eng=True) >>> # colhead >>> idx = [] >>> for i in head1: ... 
if i in ['NEE', 'rg', 'Tair', 'VPD']: idx.append(head1.index(i)) >>> colhead = ['FC', 'SW_IN', 'TA', 'VPD'] >>> # data >>> dfin = dat[idx,:] >>> dfin[2,:] = np.where(dfin[2,:] == undef, undef, dfin[2,:]+273.15) >>> dfin[3,:] = np.where(dfin[3,:] == undef, undef, dfin[3,:]*100.) >>> # flag >>> flag = np.where(dfin == undef, 2, 0) >>> # partition >>> GPP, Reco = nee2gpp(dfin, flag=flag, date=adate, colhead=colhead, undef=undef, method='local') >>> print(GPP[1120:1128]) [-9.99900000e+03 -9.99900000e+03 -9.99900000e+03 4.40606871e+00 8.31942152e+00 1.06242542e+01 8.49245664e+00 1.12381973e+01] >>> print(Reco[1120:1128]) [1.68311981 1.81012431 1.9874173 2.17108871 2.38759152 2.64372415 2.90076664 3.18592735] >>> GPP, Reco = nee2gpp(dfin, flag=flag, date=adate, colhead=colhead, undef=undef, method='local') >>> print(GPP[1120:1128]) [-9.99900000e+03 -9.99900000e+03 -9.99900000e+03 4.40606871e+00 8.31942152e+00 1.06242542e+01 8.49245664e+00 1.12381973e+01] >>> GPP, Reco = nee2gpp(dfin, flag=flag, date=adate, colhead=colhead, undef=undef, method='Reichstein') >>> print(GPP[1120:1128]) [-9.99900000e+03 -9.99900000e+03 -9.99900000e+03 4.40606871e+00 8.31942152e+00 1.06242542e+01 8.49245664e+00 1.12381973e+01] >>> GPP, Reco = nee2gpp(dfin, flag=flag, date=adate, colhead=colhead, undef=undef, method='reichstein') >>> print(GPP[1120:1128]) [-9.99900000e+03 -9.99900000e+03 -9.99900000e+03 4.40606871e+00 8.31942152e+00 1.06242542e+01 8.49245664e+00 1.12381973e+01] History ------- Written Matthias Cuntz, Mar 2012 Modified Arndt Piayda, Mar 2012 - undef=np.nan Matthias Cuntz, Nov 2012 - individual routine Matthias Cuntz, Feb 2013 - ported to Python 3 """ # Variables fc_id = '' for cc in df.columns: if cc.startswith('FC_') or (cc == 'FC') or cc.startswith('NEE_') or ( cc == 'NEE'): fc_id = cc break ta_id = '' for cc in df.columns: if cc.startswith('TA_') or (cc == 'TA'): ta_id = cc break assert fc_id, 'Carbon net flux with name FC or NEE or starting with FC_ or NEE_ must be in 
input.' assert ta_id, 'Air temperature with name TA or starting with TA_ must be in input.' nee = np.ma.array(df[fc_id], mask=(ff[fc_id] > 0)) t = np.ma.array(df[ta_id], mask=(ff[ta_id] > 0)) misday = np.ma.array(isday, mask=((~np.isfinite(isday)) | (isday == undef))) dates = df.index.to_julian_date() # Partition - Local relationship = Reichstein et al. (2005) ndata = nee.size GPP = np.ones(ndata) * undef Reco = np.ones(ndata) * undef dfout = pd.DataFrame({'GPP': GPP, 'RECO': Reco}, index=df.index) # Select valid nighttime mask = misday | nee.mask | t.mask | misday.mask ii = np.where(~mask)[0] if (ii.size == 0): # raise ValueError('Error _nee2gpp_reichstein: no valid nighttime data.') print('Warning _nee2gpp_reichstein: no valid nighttime data.') return dfout jul = dates[ii] tt = np.ma.compressed(t[ii]) net = np.ma.compressed(nee[ii]) # 1. each 5 days, in 15 day period, fit if range of T > 5 locp = [] # local param locs = [] # local err dmin = np.floor(np.amin(jul)).astype( np.int) # be aware that julian days starts at noon, i.e. 1.0 is 12h dmax = np.ceil(np.amax(jul)).astype( np.int ) # so the search will be from noon to noon and thus includes all nights for i in range(dmin, dmax, 5): iii = np.where((jul >= i) & (jul < (i + 14)))[0] niii = iii.size if niii > 6: tt1 = tt[iii] net1 = net[iii] mm = ~mad(net1, z=4.5) # make fit more robust by removing outliers if (np.ptp(tt[iii]) >= 5.) & (np.sum(mm) > 6): p, temp1, temp2 = opt.fmin_tnc(cost_lloyd_fix, [2., 200.], bounds=[[0., None], [0., None]], args=(tt1[mm], net1[mm]), approx_grad=True, disp=False) try: p1, c = opt.curve_fit(lloyd_fix, tt1[mm], net1[mm], p0=p, maxfev=10000) # params, covariance if np.all(np.isfinite( c)): # possible return of curvefit: c=inf s = np.sqrt(np.diag(c)) else: s = 10. * np.abs(p) except: s = 10. 
* np.abs(p) locp += [p] locs += [s] # if ((s[1]/p[1])<0.5) & (p[1] > 0.): pdb.set_trace() if len(locp) == 0: # raise ValueError('Error _nee2gpp_reichstein: No local relationship found.') print('Warning _nee2gpp_reichstein: No local relationship found.') return dfout locp = np.squeeze(np.array(locp).astype(np.float)) locs = np.squeeze(np.array(locs).astype(np.float)) # 2. E0 = avg of best 3 # Reichstein et al. (2005), p. 1430, 1st paragraph. with warnings.catch_warnings(): warnings.simplefilter("ignore") iii = np.where((locp[:, 1] > 0.) & (locp[:, 1] < 450.) & (np.abs(locs[:, 1] / locp[:, 1]) < 0.5))[0] niii = iii.size if niii == 0: # raise ValueError('Error _nee2gpp_reichstein: No good local relationship found.') # loosen the criteria: take the best three estimates anyway iii = np.where((locp[:, 1] > 0.))[0] niii = iii.size if niii < 1: # raise ValueError('Error _nee2gpp_reichstein: No E0>0 found.') print('Warning _nee2gpp_reichstein: No E0>0 found.') return dfout lp = locp[iii, :] ls = locs[iii, :] iis = np.argsort(ls[:, 1]) bestp = np.mean(lp[iis[0:np.minimum(3, niii)], :], axis=0) bests = np.mean(ls[iis[0:np.minimum(3, niii)], :], axis=0) elif niii == 1: bestp = np.squeeze(locp[iii, :]) bests = np.squeeze(locs[iii, :]) elif niii == 2: bestp = np.mean(locp[iii, :], axis=0) bests = np.mean(locs[iii, :], axis=0) # ls = locs[iii,:] # iis = np.argsort(ls[:,1]) else: lp = locp[iii, :] ls = locs[iii, :] iis = np.argsort(ls[:, 1]) bestp = np.mean(lp[iis[0:3], :], axis=0) bests = np.mean(ls[iis[0:3], :], axis=0) # 3. 
Refit Rref with fixed E0, each 4 days refp = [] # Rref param refii = [] # mean index of data points E0 = bestp[1] et = lloyd_fix(tt, 1., E0) for i in range(dmin, dmax, 4): iii = np.where((jul >= i) & (jul < (i + 4)))[0] niii = iii.size if niii > 3: # Calc directly minisation of (nee-p*et)**2 p, temp1, temp2 = opt.fmin_tnc(cost_abs, [2.], bounds=[[0., None]], args=(lloyd_only_rref_p, et[iii], net[iii]), approx_grad=True, disp=False) refp += [p] refii += [np.int((iii[0] + iii[-1]) // 2)] if len(refp) == 0: # raise ValueError('Error _nee2gpp_reichstein: No ref relationship found.') print('Warning _nee2gpp_reichstein: No ref relationship found.') return dfout refp = np.squeeze(np.array(refp)) refii = np.squeeze(np.array(refii)) # 4. Interpol Rref Rref = np.interp(dates, jul[refii], refp) # 5. Calc Reco Reco = np.ones(ndata) * undef ii = np.where(~t.mask)[0] Reco[ii] = lloyd_fix(t[ii], Rref[ii], E0) # 6. Calc GPP GPP = np.ones(ndata) * undef ii = np.where(~(t.mask | nee.mask))[0] GPP[ii] = Reco[ii] - nee[ii] # 7. Set GPP=0 at night, if wanted if nogppnight: mask = misday | nee.mask | t.mask | misday.mask # night ii = np.where(~mask)[0] Reco[ii] = nee[ii] GPP[ii] = 0. # and prohibit negative gpp at any time mask = nee.mask | t.mask | (GPP > 0.) ii = np.where(~mask)[0] Reco[ii] -= GPP[ii] GPP[ii] = 0. dfout = pd.DataFrame({'GPP': GPP, 'RECO': Reco}, index=df.index) return dfout
def test_3():
    """Check that mad.mad returns 1 for the sample [1, 2, 3]."""
    sample = [1, 2, 3]
    expected = 1
    assert expected == mad.mad(sample)
def madspikes(dfin, flag=None, isday=None, colhead=None, undef=-9999,
              nscan=15 * 48, nfill=1 * 48, z=7, deriv=2, swthr=10.,
              plot=False):
    """
    Spike detection using a moving median absolute difference filter.

    Used with Eddy covariance data in Papale et al. (Biogeosciences, 2006).

    Parameters
    ----------
    dfin : pandas.Dataframe or numpy.array
        time series of data where spike detection with MAD should be applied.

        `dfin` can be a pandas.Dataframe.

        `dfin` can also be a numpy array. In this case `colhead` must be
        given. MAD will be applied along axis=0, i.e. on each column of
        axis=1.
    flag : pandas.Dataframe or numpy.array, optional
        flag Dataframe or array has the same shape as dfin. Non-zero values
        in `flag` will be treated as missing values in `dfin`.

        If `flag` is numpy array, `df.columns.values` will be used as column
        heads.
    isday : array_like of bool, optional
        True when it is day, False when night. Must have the same length as
        dfin.shape[0].

        If `isday` is not given, `dfin` must have a column with head 'SW_IN'
        or starting with 'SW_IN'. `isday` will then be
        `dfin['SW_IN'] > swthr`.
    colhead : array_like of str, optional
        column names if `dfin` is numpy array.
    undef : float, optional
        values having `undef` value are treated as missing values in `dfin`
        (default: -9999)
    nscan : int, optional
        size of moving window to calculate mad in time steps
        (default: 15*48)
    nfill : int, optional
        step size of moving window to calculate mad in time steps
        (default: 1*48)

        mad will be calculated in `nscan` time window. Resulting mask will
        be applied only in `nfill` window in the middle of the `nscan`
        window. Then `nscan` window will be moved by `nfill` time steps.
    z : float, optional
        Input is allowed to deviate maximum `z` standard deviations from the
        median (default: 7)
    deriv : int, optional
        0: Act on raw input.

        1: Use first derivatives.

        2: Use 2nd derivatives (default).
    swthr : float, optional
        Threshold to determine daytime from incoming shortwave radiation if
        `isday` not given (default: 10).
    plot : bool, optional
        True: data and spikes are plotted into madspikes.pdf
        (default: False).

    Returns
    -------
    pandas.Dataframe or numpy array
        flags with 2 added where spikes were detected. If `flag` was not
        given, all other entries are 0, except `undef`/NaN input values,
        which are set to 1. Columns starting with 'SW_IN' are not
        despiked.

        The type follows `flag` if given, else `dfin`; transposed numpy
        input gives transposed output.

    History
    -------
    Written,    Matthias Cuntz & Tino Rau, 2008
    Maintained, Arndt Piayda, Aug 2014
    Modified,   Matthias Cuntz, Apr 2020 - input can be pandas Dataframe or
                                           numpy array(s)
                                         - removed iteration
                Matthias Cuntz, May 2020 - numpy docstring format
    """
    # numpy or panda
    if isinstance(dfin, (np.ndarray, np.ma.MaskedArray)):
        isnumpy = True
        istrans = False
        assert colhead is not None, 'colhead must be given if input is numpy.ndarray.'
        if dfin.shape[0] == len(colhead):
            istrans = True
            df = pd.DataFrame(dfin.T, columns=colhead)
        elif dfin.shape[1] == len(colhead):
            df = pd.DataFrame(dfin, columns=colhead)
        else:
            raise ValueError(
                'Length of colhead must be number of columns in input array. len(colhead)='
                + str(len(colhead)) + ' shape(input)=(' + str(dfin.shape[0])
                + ',' + str(dfin.shape[1]) + ').')
    else:
        isnumpy = False
        istrans = False
        assert isinstance(
            dfin, pd.core.frame.DataFrame
        ), 'Input must be either numpy.ndarray or pandas.DataFrame.'
        df = dfin.copy(deep=True)

    # Incoming flags
    if flag is not None:
        if isinstance(flag, (np.ndarray, np.ma.MaskedArray)):
            fisnumpy = True
            fistrans = False
            if flag.shape[0] == len(df):
                ff = pd.DataFrame(flag, columns=df.columns.values)
            elif flag.shape[1] == len(df):
                fistrans = True
                ff = pd.DataFrame(flag.T, columns=df.columns.values)
            else:
                raise ValueError(
                    'flag must have same shape as data array. data: ({:d},{:d}); flag: ({:d},{:d})'
                    .format(dfin.shape[0], dfin.shape[1], flag.shape[0],
                            flag.shape[1]))
            ff = ff.set_index(df.index)
        else:
            fisnumpy = False
            fistrans = False
            assert isinstance(
                flag, pd.core.frame.DataFrame
            ), 'Flag must be either numpy.ndarray or pandas.DataFrame.'
            ff = flag.copy(deep=True)
    else:
        fisnumpy = isnumpy
        fistrans = istrans
        # flags: 0: good; 1: input flagged; 2: output flagged
        # Built from scratch instead of df.astype(int), which raises on NaN
        # input before the NaN could be flagged.
        ff = pd.DataFrame(0, index=df.index, columns=df.columns, dtype=int)
        ff[df == undef] = 1
        ff[df.isna()] = 1

    # day or night
    if isday is None:
        sw_id = ''
        for cc in df.columns:
            if cc.startswith('SW_IN'):
                sw_id = cc
                break
        assert sw_id, 'Global radiation with name SW or starting with SW_ must be in input if isday not given.'
        # Papale et al. (Biogeosciences, 2006): 20; REddyProc: 10
        isday = df[sw_id] > swthr
    if isinstance(isday, (pd.core.series.Series, pd.core.frame.DataFrame)):
        isday = isday.to_numpy()
    # Private writable copy: the assignment below must neither modify the
    # caller's array nor fail on a read-only view returned by pandas.
    isday = np.array(isday, subok=True)
    isday[isday == undef] = np.nan
    ff[np.isnan(isday)] = 1

    # parameters
    nrow, ncol = df.shape
    half_scan_win = nscan // 2
    half_fill_win = nfill // 2

    # calculate dusk and dawn times and separate in day and night
    # (builtin bool: the np.bool alias no longer exists in current NumPy)
    isdawn = np.zeros(nrow, dtype=bool)
    isdusk = np.zeros(nrow, dtype=bool)
    dis = isday.astype(int) - np.roll(isday, -1).astype(int)
    isdawn[:-1] = dis[:-1] == -1  # last night step before a day step
    isdusk[:-1] = dis[:-1] == 1   # last day step before a night step
    # Copies are essential: updating aliases in place would change isdawn
    # before it is used for the night mask.
    isddday = isdawn.copy()
    isddday[1:] |= isdusk[:-1]    # start and end of day
    isddnight = isdusk.copy()
    isddnight[1:] |= isdawn[:-1]  # start and end of night

    # iterate over each column of data
    if plot:
        import matplotlib.pyplot as plt
        import matplotlib.backends.backend_pdf as pdf
        pd.plotting.register_matplotlib_converters()
        pp = pdf.PdfPages('madspikes.pdf')

    for icol, hcol in enumerate(df.columns):

        # radiation is only used for the day/night split, not despiked
        if hcol.startswith('SW_IN'):
            continue

        data = df[hcol]
        dflag = ff[hcol]

        # get day and night data; everything else is set to NaN
        data_day = data.copy(deep=True)
        data_day[~(isday | isddday) | (dflag != 0) | (data == undef)] = np.nan
        data_night = data.copy(deep=True)
        data_night[~(~isday | isddnight) | (dflag != 0) | (data == undef)] = np.nan

        # iterate over fill window
        for j in range(half_fill_win, nrow - 1, 2 * half_fill_win):
            j1 = max(j - half_scan_win - 1, 0)
            j2 = min(j + half_scan_win + 1, nrow)
            fill_start = max(j - half_fill_win, 1)
            fill_end = min(j + half_fill_win, nrow - 1)
            # NOTE(review): the "- 1" offset presumably matches the shorter
            # output of mad for deriv=2 -- confirm for deriv=0 and deriv=1.
            sl = slice(fill_start - j1 - 1, fill_end - j1 - 1)

            dd = data_day[j1:j2].to_numpy()
            day_flag = mad(np.ma.masked_array(data=dd, mask=np.isnan(dd)),
                           z=z, deriv=deriv)
            ff.iloc[fill_start:fill_end, icol] += np.where(day_flag[sl], 2, 0)

            nn = data_night[j1:j2].to_numpy()
            night_flag = mad(np.ma.masked_array(data=nn, mask=np.isnan(nn)),
                             z=z, deriv=deriv)
            ff.iloc[fill_start:fill_end, icol] += np.where(night_flag[sl], 2, 0)

        if plot:
            fig = plt.figure(1)
            sub = fig.add_subplot(111)
            valid = ff[hcol] == 0
            sub.plot(data[valid], 'ob')
            sub.plot(data[ff[hcol] == 2], 'or')
            plt.title(hcol)
            pp.savefig(fig)
            plt.close(fig)

    # Finish
    if plot:
        pp.close()

    if fisnumpy:
        if fistrans:
            return ff.to_numpy().T
        else:
            return ff.to_numpy()
    else:
        return ff