def calc_climatology(Ser, moving_avg_orig=5, moving_avg_clim=30, median=False, timespan=None):
    '''
    Calculates the climatology of a data set

    Parameters
    ----------
    Ser : pandas.Series (index must be a DateTimeIndex or julian date)
    moving_avg_orig : float, optional
        The size of the moving_average window [days] that will be applied on
        the input Series (gap filling, short-term rainfall correction)
        Default: 5
    moving_avg_clim : float, optional
        The size of the moving_average window [days] that will be applied on
        the calculated climatology (long-term event correction)
        Default: 30
    median : boolean, optional
        if set to True, the climatology will be based on the median conditions
    timespan : [timespan_from, timespan_to], datetime.datetime(y,m,d), optional
        Set this to calculate the climatology based on a subset of the
        input Series

    Returns
    -------
    climatology : pandas.Series
        Series containing the calculated climatology
    '''
    if timespan is not None:
        Ser = Ser.truncate(before=timespan[0], after=timespan[1])

    # First smoothing pass on the raw series (gap filling / short-term
    # rainfall correction).
    Ser = moving_average(Ser, window_size=moving_avg_orig)

    Ser = pd.DataFrame(Ser)

    # Derive day-of-year from either a DatetimeIndex or julian-date values.
    if type(Ser.index) == pd.DatetimeIndex:
        doys = doy(Ser.index.month, Ser.index.day)
    else:
        year, month, day = julian2date(Ser.index.values)[0:3]
        doys = doy(month, day)

    Ser['doy'] = doys

    # Aggregate all observations sharing the same day-of-year.
    if median:
        clim = Ser.groupby('doy').median()
    else:
        clim = Ser.groupby('doy').mean()

    # Second smoothing pass on the aggregated climatology (long-term
    # event correction).
    return moving_average(pd.Series(clim.values.flatten(),
                                    index=clim.index.values),
                          window_size=moving_avg_clim)
def calc_anomaly(Ser, window_size=35, climatology=None):
    '''
    Calculates the anomaly of a time series (Pandas series).
    Both, climatology based, or moving-average based anomalies can be
    calculated

    Parameters
    ----------
    Ser : pandas.Series (index must be a DateTimeIndex or julian date)
    window_size : float, optional
        The window-size [days] of the moving-average window to calculate the
        anomaly reference (only used if climatology is not provided)
        Default: 35 (days)
    climatology : pandas.Series (index: 1-366), optional
        if provided, anomalies will be based on the climatology

    Returns
    -------
    anomaly : pandas.Series
        Series containing the calculated anomalies
    '''
    if climatology is not None:
        # Map each observation to its day-of-year so it can be matched
        # against the 1-366 indexed climatology.
        if type(Ser.index) == pd.DatetimeIndex:
            doys = doy(Ser.index.month, Ser.index.day)
        else:
            year, month, day = julian2date(Ser.index.values)[0:3]
            doys = doy(month, day)
        df = pd.DataFrame()
        df['absolute'] = Ser
        df['doy'] = doys
        clim = pd.DataFrame(climatology, columns=['climatology'])
        # Left join keeps every observation; days without a climatology
        # value yield NaN anomalies.
        df = df.join(clim, on='doy', how='left')
        anomaly = df['absolute'] - df['climatology']
        anomaly.index = df.index
    else:
        # Anomaly relative to a moving average of the series itself.
        reference = moving_average(Ser, window_size=window_size)
        anomaly = Ser - reference
    return anomaly
def test_doy():
    # Scalar inputs. doy(2, 29) == 60 implies the no-year default behaves
    # like a leap-year calendar -- confirm against the doy implementation.
    day_of_year = doy(1, 28)
    assert day_of_year == 28
    day_of_year = doy(2, 29)
    assert day_of_year == 31 + 29
    day_of_year = doy(3, 1, year=2004)
    assert day_of_year == 31 + 29 + 1
    # test numpy arrays as input
    days = np.array([28, 29, 1], dtype=int)
    months = np.array([1, 2, 3])
    # Element-wise year argument ...
    days_of_year = doy(months, days, year=np.array([2005, 2004, 2004]))
    nptest.assert_allclose(days_of_year, np.array([28, 31 + 29, 31 + 29 + 1]))
    # ... and a scalar year broadcast over all elements.
    days_of_year = doy(months, days, year=2004)
    nptest.assert_allclose(days_of_year, np.array([28, 31 + 29, 31 + 29 + 1]))
def calc_climatology(Ser, moving_avg_orig=5, moving_avg_clim=30, median=False, timespan=None):
    '''
    Calculates the climatology of a data set

    Parameters
    ----------
    Ser : pandas.Series (index must be a DateTimeIndex)
    moving_avg_orig : float, optional
        The size of the moving_average window [days] that will be applied on
        the input Series (gap filling, short-term rainfall correction)
        Default: 5
    moving_avg_clim : float, optional
        The size of the moving_average window [days] that will be applied on
        the calculated climatology (long-term event correction)
        Default: 30
    median : boolean, optional
        if set to True, the climatology will be based on the median conditions
    timespan : [timespan_from, timespan_to], datetime.datetime(y,m,d), optional
        Set this to calculate the climatology based on a subset of the
        input Series

    Returns
    -------
    climatology : pandas.Series
        Series containing the calculated climatology
    '''
    if timespan is not None:
        Ser = Ser.truncate(before=timespan[0], after=timespan[1])

    # First smoothing pass; sample_to_days / fast are flags of the local
    # moving_average helper -- semantics not visible here.
    Ser = moving_average(Ser, window_size=moving_avg_orig,
                         sample_to_days=True, fast=True)

    Ser = pd.DataFrame(Ser)
    Ser['doy'] = doy(Ser.index.month, Ser.index.day)

    # Aggregate per day-of-year.
    if median:
        clim = Ser.groupby('doy').median()
    else:
        clim = Ser.groupby('doy').mean()

    # Second smoothing pass on the climatology; no_date signals the helper
    # that the index is 1-366 day numbers, not dates.
    return moving_average(pd.Series(clim.values.flatten(),
                                    index=clim.index.values),
                          window_size=moving_avg_clim, no_date=True)
def calc_anomaly(Ser, window_size=35, climatology=None):
    '''
    Calculates the anomaly of a time series (Pandas series).
    Both, climatology based, or moving-average based anomalies can be
    calculated

    Parameters
    ----------
    Ser : pandas.Series (index must be a DateTimeIndex)
    window_size : float, optional
        The window-size [days] of the moving-average window to calculate the
        anomaly reference (only used if climatology is not provided)
        Default: 35 (days)
    climatology : pandas.Series (index: 1-366), optional
        if provided, anomalies will be based on the climatology

    Returns
    -------
    anomaly : pandas.Series
        Series containing the calculated anomalies
    '''
    if climatology is not None:
        # NOTE(review): pd.DataFrame(Series, columns=['absolute']) only
        # labels the column as intended when the Series is unnamed; with a
        # named Series newer pandas produces an all-NaN column -- verify
        # against the pandas version this targets.
        Ser = pd.DataFrame(Ser, columns=['absolute'])
        Ser['doy'] = doy(Ser.index.month, Ser.index.day)
        clim = pd.DataFrame(climatology, columns=['climatology'])
        # Left join keeps every observation; unmatched days give NaN.
        Ser = Ser.join(clim, on='doy', how='left')
        anomaly = Ser['absolute'] - Ser['climatology']
        anomaly.index = Ser.index
    else:
        # Anomaly relative to a moving average of the series itself.
        reference = moving_average(Ser, window_size=window_size, fast=True)
        anomaly = Ser - reference
    return anomaly
def calc_climatology(Ser, moving_avg_orig=5, moving_avg_clim=30, median=False, timespan=None, fill=np.nan, wraparound=False):
    '''
    Calculates the climatology of a data set.

    Parameters
    ----------
    Ser : pandas.Series (index must be a DateTimeIndex or julian date)
    moving_avg_orig : float, optional
        The size of the moving_average window [days] that will be applied on
        the input Series (gap filling, short-term rainfall correction)
        Default: 5
    moving_avg_clim : float, optional
        The size of the moving_average window [days] that will be applied on
        the calculated climatology (long-term event correction)
        Default: 30
    median : boolean, optional
        if set to True, the climatology will be based on the median conditions
    timespan : [timespan_from, timespan_to], datetime.datetime(y,m,d), optional
        Set this to calculate the climatology based on a subset of the
        input Series
    fill : float or int, optional
        Fill value to use for days on which no climatology exists
    wraparound : boolean, optional
        If set then the climatology is wrapped around at the edges before
        doing the second running average (long-term event correction)

    Returns
    -------
    climatology : pandas.Series
        Series containing the calculated climatology
        Always has 366 values behaving like a leap year
    '''
    if timespan is not None:
        Ser = Ser.truncate(before=timespan[0], after=timespan[1])

    # First smoothing pass (gap filling / short-term correction).
    Ser = moving_average(Ser, window_size=moving_avg_orig)

    Ser = pd.DataFrame(Ser)

    # Derive day-of-year from either a DatetimeIndex or julian-date values.
    if type(Ser.index) == pd.DatetimeIndex:
        doys = doy(Ser.index.month, Ser.index.day)
    else:
        year, month, day = julian2date(Ser.index.values)[0:3]
        doys = doy(month, day)

    Ser['doy'] = doys

    if median:
        clim = Ser.groupby('doy').median()
    else:
        clim = Ser.groupby('doy').mean()

    clim_ser = pd.Series(clim.values.flatten(),
                         index=clim.index.values)

    if wraparound:
        # Pad the series with mirrored copies of its own edges so that the
        # second running average has full support around day 1 and day 366.
        index_old = clim_ser.index.copy()
        left_mirror = clim_ser.iloc[-moving_avg_clim:]
        right_mirror = clim_ser.iloc[:moving_avg_clim]
        # Shift index to start at 366 - index at -moving_avg_clim
        # to run over a whole year while keeping gaps the same size
        right_mirror.index = right_mirror.index + 366 * 2
        clim_ser.index = clim_ser.index + 366
        clim_ser = pd.concat([left_mirror, clim_ser, right_mirror])

        clim_ser = moving_average(clim_ser, window_size=moving_avg_clim)
        # Drop the padding again and restore the original 1-366 index.
        clim_ser = clim_ser.iloc[moving_avg_clim:-moving_avg_clim]
        clim_ser.index = index_old
    else:
        clim_ser = moving_average(clim_ser, window_size=moving_avg_clim)

    # Force a full leap-year shaped result; unobserved days get `fill`.
    clim_ser = clim_ser.reindex(np.arange(366) + 1)
    clim_ser = clim_ser.fillna(fill)
    return clim_ser
def calc_anomaly(Ser, window_size=35, climatology=None, respect_leap_years=True, return_clim=False):
    '''
    Calculates the anomaly of a time series (Pandas series).
    Both, climatology based, or moving-average based anomalies can be
    calculated

    Parameters
    ----------
    Ser : pandas.Series (index must be a DateTimeIndex or julian date)
    window_size : float, optional
        The window-size [days] of the moving-average window to calculate the
        anomaly reference (only used if climatology is not provided)
        Default: 35 (days)
    climatology : pandas.Series (index: 1-366), optional
        if provided, anomalies will be based on the climatology
    respect_leap_years : boolean, optional
        If set then leap years will be respected during matching of the
        climatology to the time series
    return_clim : boolean, optional
        if set to true the return argument will be a DataFrame which
        also contains the climatology time series. Only has an effect if
        climatology is used.

    Returns
    -------
    anomaly : pandas.Series or pandas.DataFrame
        Series containing the calculated anomalies, or a DataFrame with
        'anomaly' and 'climatology' columns if return_clim is set
    '''
    if climatology is not None:
        # Extract year/month/day from either index flavour.
        if type(Ser.index) == pd.DatetimeIndex:
            year, month, day = (np.asarray(Ser.index.year),
                                np.asarray(Ser.index.month),
                                np.asarray(Ser.index.day))
        else:
            year, month, day = julian2date(Ser.index.values)[0:3]

        # Passing the year lets doy account for leap-day shifts.
        if respect_leap_years:
            doys = doy(month, day, year)
        else:
            doys = doy(month, day)

        df = pd.DataFrame()
        df['absolute'] = Ser
        df['doy'] = doys

        clim = pd.DataFrame({'climatology': climatology})

        # Left join keeps every observation; unmatched days give NaN.
        df = df.join(clim, on='doy', how='left')

        anomaly = df['absolute'] - df['climatology']
        anomaly.index = df.index

        if return_clim:
            anomaly = pd.DataFrame({'anomaly': anomaly})
            anomaly['climatology'] = df['climatology']
    else:
        # Anomaly relative to a moving average of the series itself.
        reference = moving_average(Ser, window_size=window_size)
        anomaly = Ser - reference

    return anomaly
def calc_climatology(Ser, moving_avg_orig=5, moving_avg_clim=30, median=False, timespan=None, fill=np.nan, wraparound=False, respect_leap_years=False, interpolate_leapday=False, fillna=True, min_obs_orig=1, min_obs_clim=1):
    '''
    Calculates the climatology of a data set.

    Parameters
    ----------
    Ser : pandas.Series (index must be a DateTimeIndex or julian date)
    moving_avg_orig : float, optional
        The size of the moving_average window [days] that will be applied on
        the input Series (gap filling, short-term rainfall correction)
        Default: 5
    moving_avg_clim : float, optional
        The size of the moving_average window [days] that will be applied on
        the calculated climatology (long-term event correction)
        Default: 30
    median : boolean, optional
        if set to True, the climatology will be based on the median conditions
    timespan : [timespan_from, timespan_to], datetime.datetime(y,m,d), optional
        Set this to calculate the climatology based on a subset of the
        input Series
    fill : float or int, optional
        Fill value to use for days on which no climatology exists
    wraparound : boolean, optional
        If set then the climatology is wrapped around at the edges before
        doing the second running average (long-term event correction)
    respect_leap_years : boolean, optional
        If set then leap years will be respected during the calculation of
        the climatology
        Default: False
    interpolate_leapday : boolean, optional
        If set, the Feb 29 climatology value is interpolated from its
        neighbours (doy 59/61 if respect_leap_years is unset, doy 365/1
        otherwise)
        Default: False
    fillna : boolean, optional
        If set, then the moving average used for the calculation of the
        climatology will be filled at the nan-values
    min_obs_orig : int
        Minimum observations required to give a valid output in the first
        moving average applied on the input series
    min_obs_clim : int
        Minimum observations required to give a valid output in the second
        moving average applied on the calculated climatology

    Returns
    -------
    climatology : pandas.Series
        Series containing the calculated climatology
        Always has 366 values behaving like a leap year
    '''
    if timespan is not None:
        Ser = Ser.truncate(before=timespan[0], after=timespan[1])

    # First smoothing pass (gap filling / short-term correction).
    Ser = moving_average(Ser, window_size=moving_avg_orig, fillna=fillna,
                         min_obs=min_obs_orig)

    Ser = pd.DataFrame(Ser)

    # Extract year/month/day from either index flavour.
    if type(Ser.index) == pd.DatetimeIndex:
        year, month, day = (np.asarray(Ser.index.year),
                            np.asarray(Ser.index.month),
                            np.asarray(Ser.index.day))
    else:
        year, month, day = julian2date(Ser.index.values)[0:3]

    # Passing the year lets doy account for leap-day shifts.
    if respect_leap_years:
        doys = doy(month, day, year)
    else:
        doys = doy(month, day)

    Ser['doy'] = doys

    if median:
        clim = Ser.groupby('doy').median()
    else:
        clim = Ser.groupby('doy').mean()

    clim_ser = pd.Series(clim.values.flatten(),
                         index=clim.index.values)

    # Fill in the leap day from its calendar neighbours; its doy position
    # depends on whether leap years were respected above.
    if interpolate_leapday and not respect_leap_years:
        clim_ser[60] = np.mean((clim_ser[59], clim_ser[61]))
    elif interpolate_leapday and respect_leap_years:
        clim_ser[366] = np.mean((clim_ser[365], clim_ser[1]))

    if wraparound:
        # Pad the series with mirrored copies of its own edges so that the
        # second running average has full support around day 1 and day 366.
        index_old = clim_ser.index.copy()
        left_mirror = clim_ser.iloc[-moving_avg_clim:]
        right_mirror = clim_ser.iloc[:moving_avg_clim]
        # Shift index to start at 366 - index at -moving_avg_clim
        # to run over a whole year while keeping gaps the same size
        right_mirror.index = right_mirror.index + 366 * 2
        clim_ser.index = clim_ser.index + 366
        clim_ser = pd.concat([left_mirror, clim_ser, right_mirror])

        clim_ser = moving_average(clim_ser, window_size=moving_avg_clim,
                                  fillna=fillna, min_obs=min_obs_clim)
        # Drop the padding again and restore the original 1-366 index.
        clim_ser = clim_ser.iloc[moving_avg_clim:-moving_avg_clim]
        clim_ser.index = index_old
    else:
        clim_ser = moving_average(clim_ser, window_size=moving_avg_clim,
                                  fillna=fillna, min_obs=min_obs_clim)

    # Force a full leap-year shaped result; unobserved days get `fill`.
    clim_ser = clim_ser.reindex(np.arange(366) + 1)
    clim_ser = clim_ser.fillna(fill)
    return clim_ser
def _read_gp(self, gpi, **kwargs):
    """
    reads the time series of the given grid point index. Masks frozen
    and snow observations if keywords are present

    Parameters
    ----------
    gpi : long
        grid point index
    mask_frozen_prob : int, optional
        if included in kwargs then all observations taken when
        frozen probability > mask_frozen_prob are removed from the result
    mask_snow_prob : int, optional
        if included in kwargs then all observations taken when
        snow probability > mask_snow_prob are removed from the result

    Returns
    -------
    df : pandas.DataFrame
        containing all fields in the list self.include_in_df
        plus frozen_prob and snow_prob if a path to advisory flags was
        set during initialization
    """
    index = np.where(gpi == self.gpis)[0]
    cell = self.cells[index][0]
    gp_file = os.path.join(self.path, '%4d' % cell,
                           self.gp_filename_template % gpi)

    if not os.path.exists(gp_file):
        # Cell archive has not been unpacked yet; do it on first access.
        # FIX: was a Python 2 `print` statement; the parenthesized form
        # is valid in both Python 2 and 3.
        print('first time reading from cell %4d unzipping ...' % cell)
        self.unzip_cell(cell)

    data = np.fromfile(gp_file, dtype=self.gp_filestruct)
    dates = data['DAT']

    datetime_parser = np.vectorize(self._datetime_arr)
    datetimes_correct = datetime_parser(dates)

    dict_df = {}
    for into_df in self.include_in_df:
        # Mask the per-field nan sentinel and apply an optional scale
        # factor.
        d = np.ma.asarray(data[into_df], dtype=self.datatype[into_df])
        d = np.ma.masked_equal(d, self.nan_values[into_df])
        # FIX: `dict.has_key` is deprecated (removed in Python 3);
        # use the `in` operator, valid in both.
        if into_df in self.scale_factor:
            d = d * self.scale_factor[into_df]
        dict_df[into_df] = d

    df = pd.DataFrame(dict_df, index=datetimes_correct)

    if self.include_advflags:
        adv_flags, topo, water = self.read_advisory_flags(gpi)

        if topo >= self.topo_threshold:
            warnings.warn("Warning gpi shows topographic complexity of "
                          "%d %%. Data might not be usable." % topo)
        if water >= self.water_threshold:
            warnings.warn("Warning gpi shows water fraction of "
                          "%d %%. Data might not be usable." % water)

        # Join the static advisory flags onto every observation via its
        # day of year, then drop the helper column again.
        df['doy'] = doy(df.index.month, df.index.day)
        df = df.join(adv_flags, on='doy', how='left')
        del df['doy']
        if 'mask_frozen_prob' in kwargs:
            mask_frozen = kwargs['mask_frozen_prob']
            df = df[df['frozen_prob'] <= mask_frozen]
        if 'mask_snow_prob' in kwargs:
            mask_snow = kwargs['mask_snow_prob']
            df = df[df['snow_prob'] <= mask_snow]

    lon, lat = self.gpi2lonlat(gpi)

    return df, gpi, lon, lat, cell
def _read_gp(self, gpi, **kwargs):
    """
    reads the time series of the given grid point index. Masks frozen
    and snow observations if keywords are present

    Parameters
    ----------
    gpi : long
        grid point index
    mask_frozen_prob : int, optional
        if included in kwargs then all observations taken when
        frozen probability > mask_frozen_prob are removed from the result
    mask_snow_prob : int, optional
        if included in kwargs then all observations taken when
        snow probability > mask_snow_prob are removed from the result
    absolute_values : boolean, optional
        if True soil porosities from HWSD and GLDAS will be used to
        derive absolute values which will be available in the
        pandas.DataFrame in the columns
        'sm_por_gldas','sm_noise_por_gldas',
        'sm_por_hwsd','sm_noise_por_hwsd'

    Returns
    -------
    df : pandas.DataFrame
        containing all fields in the list self.include_in_df
        plus frozen_prob and snow_prob if a path to advisory flags was
        set during initialization
    gpi : long
        grid point index
    lon : float
        longitude
    lat : float
        latitude
    cell : int
        cell number
    topo : int
        topographic complexity
    wetland : int
        wetland fraction
    porosity : dict
        porosity values for 'gldas' and 'hwsd'
    """
    if not self.grid_info_loaded:
        self._load_grid_info()

    cell = self.grid.gpi2cell(gpi)

    if self.prev_cell != cell:
        # new cell - means new file object and new read bulk if
        # set
        ncfile = netCDF4.Dataset(
            os.path.join(self.path, self.netcdftemplate % cell), 'r')
        self.units = ncfile.variables['time'].units
        if self.read_bulk:
            # Cache every variable in memory and then stand in for the
            # closed file: self.variables makes `ncfile = self` satisfy
            # the `.variables[...]` accesses below.
            self.variables = {}
            for var in ncfile.variables:
                self.variables[var] = ncfile.variables[var][:]
            ncfile.close()
            ncfile = self
    # NOTE(review): when self.prev_cell == cell, `ncfile` appears unbound
    # here, and self.prev_cell is never updated in this view -- confirm
    # against the full source (this may be a partial chunk).

    gpi_index = np.where(ncfile.variables[self.loc_id][:] == gpi)[0]
    # Observations are stored as a ragged array: the per-gpi counts give
    # the offset and length of this gpi's slice.
    time_series_length = ncfile.variables[self.obs_var][gpi_index]
    startindex = np.sum(ncfile.variables[self.obs_var][:gpi_index])
    endindex = startindex + time_series_length

    timestamps = netCDF4.num2date(
        ncfile.variables['time'][startindex:endindex], self.units)

    dict_df = {}
    for into_df in self.include_in_df:
        d = ncfile.variables[into_df][startindex:endindex]
        dict_df[into_df] = d

    df = pd.DataFrame(dict_df, index=timestamps)

    # read porosity values
    porosity = {}
    for por_source in ['gldas', 'hwsd']:
        porosity[por_source] = ncfile.variables[
            'por_%s' % por_source][gpi_index][0]

    if 'absolute_values' in kwargs:
        if kwargs['absolute_values']:
            # Scale relative (percent) values into absolute ones using the
            # source-specific porosity.
            for por_source in ['gldas', 'hwsd']:
                for el in self.to_absolute:
                    df['%s_por_%s' % (el, por_source)] = (
                        df[el] / 100.0) * (porosity[por_source])

    topo = ncfile.variables[self.topo_var][gpi_index][0]
    wetland = ncfile.variables[self.wetland_var][gpi_index][0]

    snow = np.squeeze(ncfile.variables[self.snow_var][gpi_index, :])
    # if data is not valid assume no snow
    if type(snow) == np.ma.masked_array:
        warnings.warn('Snow probabilities not valid, assuming no snow')
        snow = snow.filled(0)

    frozen = np.squeeze(ncfile.variables[self.frozen_var][gpi_index, :])
    # if data is not valid assume no freezing
    if type(frozen) == np.ma.masked_array:
        warnings.warn(
            'Frozen probabilities not valid, assuming no freezing')
        frozen = frozen.filled(0)

    # NOTE(review): this DataFrame gets a default 0-based index while the
    # doy join key below is 1-based -- verify the intended alignment.
    adv_flags = pd.DataFrame({'snow_prob': snow,
                              'frozen_prob': frozen})

    if topo >= self.topo_threshold:
        warnings.warn(
            "Warning gpi shows topographic complexity of %d %%. Data might not be usable." % topo)
    if wetland >= self.wetland_threshold:
        warnings.warn(
            "Warning gpi shows wetland fraction of %d %%. Data might not be usable." % wetland)

    # Join the static advisory flags onto every observation via its day
    # of year, then drop the helper column again.
    df['doy'] = doy(df.index.month, df.index.day)
    df = df.join(adv_flags, on='doy', how='left')
    del df['doy']
    if 'mask_frozen_prob' in kwargs:
        mask_frozen = kwargs['mask_frozen_prob']
        df = df[df['frozen_prob'] <= mask_frozen]
    if 'mask_snow_prob' in kwargs:
        mask_snow = kwargs['mask_snow_prob']
        df = df[df['snow_prob'] <= mask_snow]

    lon, lat = self.grid.gpi2lonlat(gpi)

    if not self.read_bulk:
        ncfile.close()
    return df, gpi, lon, lat, cell, topo, wetland, porosity