def timedata(dt):
    """Build the extra time fields stored with a data point in influxDB.

    The fields make sorting/grouping easier on the database side.

    Args:
        dt: POSIX timestamp; converted with ``datetime.fromtimestamp``
            (i.e. interpreted in the local timezone of the process).

    Returns:
        dict with the keys ``timestamp``, ``year``, ``month``, ``hour``,
        ``minute``, ``weekday`` and ``holiday`` (the holiday export weight).
    """
    timestamp = datetime.fromtimestamp(dt)

    # Holiday export weight, as intended by the original author:
    #   only a DE holiday              -> holiday = 1.0
    #   DE holiday + other countries   -> holiday > 1.0
    #   only other countries' holidays -> holiday < 1.0
    holiday = 0
    if holidays.Germany().get(timestamp):
        holiday = 1  # DE holiday = 1

    for cn, op in holiday_op:
        # Germany is already accounted for; skip countries without a holiday.
        if cn == 'DE' or not op.get(timestamp):
            continue
        if exportWeights and exportWeights.get(cn) and exportWeights[cn].get(timestamp.year):
            # Monthly export weight; the table is indexed by month - 1.
            holiday += exportWeights[cn][timestamp.year].get(timestamp.month - 1)
        else:
            # Fallback: the country's share of the non-German population.
            foreign_population = sum(
                [pop for country, pop in country_pop.items() if country != 'DE']
            )
            holiday += country_pop[cn] / foreign_population

    return {
        'timestamp': timestamp,
        'year': timestamp.year,
        'month': timestamp.month,
        'hour': timestamp.hour,
        'minute': timestamp.minute,
        'weekday': timestamp.weekday(),
        'holiday': holiday,
    }
def __add_holiday_features(df):
    """Add an integer ``holiday`` column (1 = German holiday, 0 = not).

    The index of ``df`` is replaced by its ``pd.to_datetime`` conversion and
    the frame is modified in place; the same object is returned.

    Args:
        df: DataFrame whose index can be converted to timezone-aware
            datetimes (``tz_convert`` is called on it).

    Returns:
        The same DataFrame with the ``holiday`` column added.
    """
    df.index = pd.to_datetime(df.index)
    german_holidays = holidays.Germany()

    # The holiday lookup is done on the timestamps converted to CET.
    cet_stamps = pd.Series(df.index.tz_convert("CET"), index=df.index)
    holiday_flags = cet_stamps.apply(lambda stamp: stamp in german_holidays)

    df["holiday"] = holiday_flags.astype(int)
    return df
def impute_holiday(df):
    """Fill missing ``StateHoliday`` values from the German holiday calendar.

    Rows whose ``StateHoliday`` is null get ``'a'`` when ``Date`` is a German
    holiday and ``'0'`` otherwise. Rows that already have a value are left
    untouched. The frame is modified in place and returned.

    Args:
        df: DataFrame with a datetime ``Date`` column (``.dt`` is used on it)
            and a ``StateHoliday`` column.

    Returns:
        The same DataFrame with ``StateHoliday`` imputed.
    """
    # Only build the calendar for the years that actually occur in the data.
    years_in_data = df["Date"].dt.year.unique()
    holiday_dates = list(holidays.Germany(years=years_in_data))

    needs_value = df["StateHoliday"].isnull()
    falls_on_holiday = df["Date"].isin(holiday_dates)

    # Both masks were computed up front, so the two writes are independent.
    df.loc[needs_value & ~falls_on_holiday, 'StateHoliday'] = '0'
    df.loc[needs_value & falls_on_holiday, 'StateHoliday'] = 'a'
    return df
def add_bank_holidays(df: pd.DataFrame):
    """Add a 0/1 ``bank_holiday`` column for the German holidays of 2016.

    The frame is modified in place and returned. Every row whose index
    label is a 2016 German holiday gets ``bank_holiday = 1``; all other rows
    get 0. 15 August 2016 is additionally flagged when the holiday calendar
    does not contain it but the index does.

    Args:
        df: DataFrame indexed by date.

    Returns:
        The same DataFrame with the ``bank_holiday`` column added.
    """
    # Materialise the keys once; they are reused for the 15/08 check below.
    holiday_dates = list(holidays.Germany(years=2016).keys())

    df["bank_holiday"] = 0
    for date in holiday_dates:
        if date in df.index:
            # Single .loc call on the frame itself. The previous chained form
            # (df.bank_holiday.loc[date] = 1) may write to a temporary copy
            # and is a no-op under pandas Copy-on-Write.
            df.loc[date, "bank_holiday"] = 1

    aug_15 = datetime.date(2016, 8, 15)
    if aug_15 not in holiday_dates and aug_15 in df.index:
        # Manually adding 15/08 since it is not in the holiday dictionary.
        df.loc["2016-08-15", "bank_holiday"] = 1
    return df
def next_weekday_str(weekday, time_tamplate='%Y:%m:%d:%H:%M:%S'):
    """Return the next non-holiday Wednesday at 09:00 as a formatted string.

    Starting from today at 09:00:00, the next weekday 2 is requested from
    ``next_weekday``; while the result is a German holiday the requested day
    number is increased by 7 and the lookup is repeated.

    NOTE(review): the ``weekday`` argument is currently not used - the target
    day is hard-coded to 2. The original author also left a remark that the
    holiday skip "somehow is not working"; confirm before relying on it.

    Args:
        weekday: Unused (see note above).
        time_tamplate: ``strftime`` format for the returned string.

    Returns:
        The selected day formatted with ``time_tamplate``.
    """
    germany_holidays = holidays.Germany()
    start = datetime.now().replace(hour=9, minute=0, second=0)

    target_day = 2  # 0 = Monday, 1 = Tuesday, 2 = Wednesday...
    candidate = next_weekday(start, target_day)
    while candidate in germany_holidays:
        # Holiday: ask for the same weekday one week further out.
        target_day += 7
        candidate = next_weekday(start, target_day)

    return candidate.strftime(time_tamplate)
def __init__(self, inputCol=None, outputCol=None):
    """Set up the transformer parameters and its lookup tables.

    Args:
        inputCol: Name of the input column (forwarded via ``setParams``).
        outputCol: Name of the output column (forwarded via ``setParams``).
    """
    super(TimestampTransformer, self).__init__()
    # _input_kwargs is presumably filled in by a keyword-only decorator that
    # sits above this method, outside the visible code - TODO confirm.
    self.setParams(**self._input_kwargs)

    # Maps datetime.weekday()-style numbers (0 = Monday) to day names.
    weekday_names = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]
    self.day_to_str = dict(enumerate(weekday_names))

    # Number of time bins per day.
    # Note (original author): bins must evenly divide 60.
    self.bins = 24 * 4
    hours_per_bin = 24. / float(self.bins)
    self.minutes_per_bin = int(hours_per_bin * 60.)

    # German holiday calendar restricted to the "BW" province code.
    self.holidays = holidays.Germany(prov="BW")
'CH': 7.786, 'CZ': 10.460, 'DE': 81.750, 'DK': 5.535, 'FR': 64.610, 'LU': 0.502, 'NL': 16.570, 'PL': 38.53, 'SE': 9.341 } holiday_op = [ ('AT', holidays.Austria()), # holidays for each country ('BE', holidays.Belgium()), # implented this way because of bug in library ('CH', holidays.Switzerland()), ('CZ', holidays.Czech()), ('DE', holidays.Germany()), ('DK', holidays.Denmark()), ('FR', holidays.France()), ('LU', holidays.Luxembourg()), ('NL', holidays.Netherlands()), ('PL', holidays.Poland()), ('SE', holidays.Sweden()) ] def timedata(dt): # create time data extra information for influxDB (makes sorting easier) timestamp = datetime.fromtimestamp(dt) year = timestamp.year month = timestamp.month hour = timestamp.hour
'CH': 7.786, 'CZ': 10.460, 'DE': 81.750, 'DK': 5.535, 'FR': 64.610, 'LU': 0.502, 'NL': 16.570, 'PL': 38.53, 'SE': 9.341 } holiday_op = [ ('AT', holidays.Austria()), # holidays for each country ('BE', holidays.Belgium()), # implented this way because of bug in library ('CH', holidays.Switzerland()), ('CZ', holidays.Czech()), ('DE', holidays.Germany()), ('DK', holidays.Denmark()), ('FR', holidays.France()), ('LU', holidays.Luxembourg()), ('NL', holidays.Netherlands()), ('PL', holidays.Poland()), ('SE', holidays.Sweden()) ] #---------------------------------------------------------------------- logging.basicConfig(filename="./influxDBLoad.log", filemode="a", format='%(asctime)s %(message)s', datefmt='%Y.%m.%d %H:%M:%S', level=logging.INFO)
import holidays

# (date, name) pairs of the 2021 German holidays for the state code "BW".
de_bw_holidays = holidays.Germany(state='BW', years=2021).items()

# Sorting the pairs orders them by date; print one holiday per line.
for date, name in sorted(de_bw_holidays):
    print(date, name)
from datetime import date ### STATIC VARIABLES today = date.today() australia_holidays = holidays.Australia() austria_holidays = holidays.Austria() canada_holidays = holidays.Canada() colombia_holidays = holidays.Colombia() czech_holidays = holidays.Czech() denmark_holidays = holidays.Denmark() england_holidays = holidays.England() europeancentralbank_holidays = holidays.EuropeanCentralBank() germany_holidays = holidays.Germany() ireland_holidays = holidays.Ireland() mexico_holidays = holidays.Mexico() netherlands_holidays = holidays.Netherlands() newzealand_holidays = holidays.NewZealand() northernireland_holidays = holidays.NorthernIreland() norway_holidays = holidays.Norway() portugal_holidays = holidays.Portugal() portugalext_holidays = holidays.PortugalExt() scotland_holidays = holidays.Scotland() spain_holidays = holidays.Spain() unitedkingdom_holidays = holidays.UnitedKingdom() unitedstates_holidays = holidays.UnitedStates() wales_holidays = holidays.Wales() def isDateHoliday(date, countryHolidays):
def __init__(self, inputCol=DEPARTURE, outputCol=BEFORE_HOLIDAY):
    """Register the column parameters and load the holiday calendar.

    Args:
        inputCol: Name of the input column (defaults to ``DEPARTURE``).
        outputCol: Name of the output column (defaults to ``BEFORE_HOLIDAY``).
    """
    super().__init__()

    # The same pair of values is used both as default and as explicit value.
    column_params = {"inputCol": inputCol, "outputCol": outputCol}
    self._setDefault(**column_params)
    self._set(**column_params)

    # German holiday calendar restricted to the "BY" province code.
    self.holiday = holidays.Germany(prov='BY')
#nb different zip #raw_users.zip.nunique() #test=raw_users[raw_users.team != '202'] #test=raw_users[raw_users.zip == '86130'] users = raw_users.groupby('partition_date')['estimated_revenues'].sum() py.plot([go.Scatter(x=users.index, y=users)]) users = pd.DataFrame(users).reset_index() users.columns = ['ds', 'y'] users.head() holidays_c = pd.DataFrame( holidays.Germany(years=[2015, 2016, 2017, 2018, 2019]), index=[0]).transpose().reset_index() holidays_c.columns = ['ds', 'holiday'] m = Prophet(changepoint_prior_scale=2.5, holidays=holidays_c) m.fit(users) future = m.make_future_dataframe(periods=62) forecast = m.predict(future) py.plot([ go.Scatter(x=users['ds'], y=users['y'], name='y'), go.Scatter(x=forecast['ds'], y=forecast['yhat'], name='yhat'), go.Scatter(x=forecast['ds'], y=forecast['yhat_upper'], fill='tonexty', mode='none',
from datetime import date

import holidays

# Check whether the current local date is a German holiday and print the
# result (True/False).
today = date.today()
is_holiday = today in holidays.Germany()
print(is_holiday)
from load_data import LoadData
import matplotlib.pyplot as plt
import holidays
import pandas as pd

# German holiday calendar used by plot_df to pick holiday rows.
h = holidays.Germany()


def plot_df(df, **kwargs):
    """Plot monthly means of the holiday/weekend rows of 2018-2019.

    Args:
        df: DataFrame with a datetime ``Datum`` column.
        **kwargs: Forwarded unchanged to ``plt.plot``.
    """
    dfy = df[df["Datum"].dt.year.between(2018, 2019)]
    # Alternative filters left by the original author (working days only):
    # dfy = dfy[dfy["Datum"].dt.weekday < 5]
    # dfy = dfy[dfy.apply(lambda x: x["Datum"] not in h, axis=1)]
    # Keep only rows that fall on a holiday or on Saturday/Sunday.
    dfy = dfy[dfy.apply(lambda x: x["Datum"] in h or x["Datum"].weekday() > 4, axis=1)]
    dg = dfy.groupby(dfy["Datum"].dt.month).mean()
    plt.plot(dg, **kwargs)


pendlerstrecken = [1, 2, 4, 5, 6, 13]
freizeitstrecken = [7, 9, 10, 11, 12]

files = LoadData.load(freizeitstrecken)
avg_df = pd.DataFrame({"Datum": [], "Zaehlerstand": []})
for key, data in files.items():
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on old and new pandas versions alike.
    avg_df = pd.concat([avg_df, data])
    plot_df(data, label=LoadData.NAMINGS[key], linestyle="dotted", linewidth=1.5)
def validate(cls,args,data, rules=None, **kwargs):
    """Validate a batch of records and split it into valid and invalid rows.

    Works as a filter for the calling script so that invalid records are not
    inserted into the database. Four generic rules are applied first; the
    custom rules passed in ``rules`` are then applied, in order, to the
    records that are still valid.

    Generic rules:
        1. Every non-timestamp column is None or 0. For scripts whose name
           contains "price" such a row is still accepted when one of its
           timestamp columns falls on a UK/German holiday or on a weekend.
        2. Within any sliding window of 4000 rows, rows that duplicate
           another row of the window on the non-timestamp columns.
        3. Within any sliding window of 5 rows, a non-timestamp column
           without a single truthy value flags all rows of that window.
        4. Every non-standard, non-timestamp column is 0 or None.

    Args:
        args: Not used by this function.
        data (str): JSON string with the records to validate, of the form
            '{"data": [{"column1": "value1", "column2": "value2", ...},
                       {"column1": "value1", "column2": "value2", ...}]}'
        rules (list): Optional list of functions. Each one takes a JSON
            string shaped like ``data``, applies one validation rule and
            returns a tuple ``(valid, invalid)``. ``valid`` is a list of
            dicts, e.g. [{"column1": value1, "column2": value2}, ...].
            ``invalid`` is the collection of rejected records augmented with
            rule, reason, script and date of validation, e.g.
            [{"data": '{"column1": "value1", ...}', "rule": rule_value,
              "reason": reason_value, "script": script_value,
              "date_of_validation": date_value}, ...]
        **kwargs:
            script (str, required): name of the calling script; read as
                ``kwargs["script"]``.
            google_key_path: path to the service account key (listed by the
                original documentation; not read inside this function).

    Returns:
        tuple: ``(valid_list, invalid_list)``.

        ``valid_list`` is a list of dicts with the accepted records. When no
        custom rule is given each dict holds the original columns plus a
        "rule" key; otherwise it is whatever the last custom rule returned.

        ``invalid_list`` holds one ``(record, additional_info)`` tuple per
        record rejected by rule 2 or rule 3 (and not by rule 1 or rule 4),
        followed by the invalid items returned by the custom rules.
        ``additional_info`` is
            {"rule": rule_value, "reason": reason_value,
             "script": script_value,
             "date_of_validation": date_of_validation_value,
             "last_update_date": last_update_date_value}

        Records rejected by rule 1 or rule 4 are dropped: they appear in
        neither list.
    """
    def rule_1(row, tc, ntc, uk, de, script):
        # Any real value in a non-timestamp column makes the row valid.
        for c in ntc:
            if row[c] is not None and row[c] != 0:
                return "valid"
        #All values were 0
        #Check if they are holiday or weekend
        for c in tc:
            try:
                parsed = datetime.strptime(row[c], '%Y-%m-%d %H:%M:%S')
            except (TypeError, ValueError):
                # Not a string, or not in the expected format: skip column.
                continue
            parsed_date = str(parsed.date())
            # weekday(): Monday is 0 ... Sunday is 6, so >= 5 is the weekend.
            # (The previous `not parsed.weekday()` matched Mondays only.)
            if parsed_date in uk or parsed_date in de or parsed.weekday() >= 5:
                if "price" in script:
                    return "valid"
                else:
                    rule_list.add("rule_1")
                    return "invalid"
        #Not weekend or holiday
        rule_list.add("rule_1")
        return "invalid"

    def rule_4(row,nsc):
        for c in nsc:
            if row[c] != 0 and row[c] is not None:
                return "valid"
        rule_list.add("rule_4")
        return "invalid"

    # Avoid a shared mutable default argument.
    rules = [] if rules is None else rules

    data_decoded = jsonpickle.decode(data)
    valid_data = []
    invalid_data = []
    rule_list = set()
    reasons = {
        "rule_1": "Rule 1: all columns (except timestamp) are null or 0.",
        "rule_2": "Rule 2: the 4000 previously-validated records are identical.",
        "rule_3": "Rule 3: the same column had value of 0 or NULL for the last 5 rows.",
        "rule_4": "Rule 4: standard columns like Constituent name, id, date have values but all others are 0."
    }

    #First, perform generic data validation rules
    #Rule 1: check if all columns (except timestamp) are null or 0
    #Get timestamp and non-timestamp columns
    script = kwargs["script"]
    uk_holidays = holidays.UK()
    de_holidays = holidays.Germany()

    df = pd.DataFrame(data_decoded["data"])
    standard_columns = ['constituent_name', "constituent_id", "date", "last_update_date"]
    original_columns = df.columns
    timestamp_columns = [c for c in df.columns if ("date" in c or "time" in c)]
    non_timestamp_columns = [c for c in df.columns if ("date" not in c and "time" not in c)]
    non_standard_columns = [c for c in df.columns if c not in standard_columns + timestamp_columns]

    df["rule_1"] = df.apply(lambda x: rule_1(x,timestamp_columns,non_timestamp_columns,uk_holidays,
                                             de_holidays,script), axis=1)

    #Rule 2: Are the 4000 previously-validated records identical?
    invalid_indices = set()
    if df.shape[0] >= 4000:
        #We can apply rule
        # .loc slicing is label based and inclusive: 0..3999 is 4000 rows.
        start = 0
        end = 3999
        while end < df.shape[0]:
            df_temp = df.loc[start:end]
            df_duplicates = df_temp[df_temp.duplicated(subset=non_timestamp_columns, keep=False)]
            invalid_indices.update(list(df_duplicates.index))
            start += 1
            end += 1

    df["rule_2"] = "valid"
    df.loc[list(invalid_indices), "rule_2"] = "invalid"
    if len(invalid_indices) > 0:
        rule_list.add("rule_2")

    #Rule 3: Has the same column had value of 0 or NULL for the last 5 rows?
    invalid_indices_2 = set()
    start = 0
    end = 4
    while end < df.shape[0]:
        for c in non_timestamp_columns:
            series = df.loc[start:end][c]
            if series.any() == False:
                invalid_indices_2.update(list(series.index))
        start += 1
        end += 1

    df["rule_3"] = "valid"
    df.loc[list(invalid_indices_2), "rule_3"] = "invalid"
    if len(invalid_indices_2) > 0:
        rule_list.add("rule_3")

    #Rule 4: If standard columns like Constituent name, id, date have values but all others are 0 reject?
    df["rule_4"] = df.apply(lambda x: rule_4(x,non_standard_columns), axis=1)

    # Placeholder so the column can hold a list of rule names per row.
    df["rule"] = object

    #Get invalid records
    # The frame is built from a list of records, so it has the default
    # 0..n-1 index and row positions coincide with index labels.
    invalid_indices = set()
    for i in range(0, df.shape[0]):
        row_rules = []
        row = df.iloc[i]
        for r in ["rule_1","rule_2", "rule_3", "rule_4"]:
            if row[r] == "invalid":
                invalid_indices.add(i)
                row_rules.append(r)
        df.at[i, 'rule'] = row_rules

    #Get valid indices
    valid_indices = set(df.index.tolist())
    valid_indices = valid_indices.difference(invalid_indices)

    valid_data += df.loc[list(valid_indices)][original_columns.tolist() + ["rule"]].to_dict(orient='records')
    invalid_data += df.loc[list(invalid_indices)][original_columns.tolist() + ["rule"]].to_dict(orient='records')

    #Custom rules
    # Each rule receives only the records that survived the previous ones.
    custom_invalid = []
    for func in rules:
        valid, invalid = func(jsonpickle.encode({"data":valid_data}))
        valid_data = valid
        custom_invalid += invalid

    #Format invalid data
    invalid_data_store = []

    #Add rule and reason
    for item in invalid_data:
        # Rows rejected by rule 1 or rule 4 are not stored.
        if "rule_1" in item["rule"] or "rule_4" in item["rule"]:
            continue
        validated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        additional_info = {}
        additional_info["rule"] = item["rule"]
        additional_info["reason"] = [reasons[r] for r in item["rule"] if isinstance(item["rule"], list)]
        additional_info["script"] = script
        additional_info["date_of_validation"] = validated_at
        additional_info["last_update_date"] = validated_at
        invalid_data_store.append((item,additional_info))

    invalid_data_store += custom_invalid

    return list(valid_data), invalid_data_store
#!/usr/bin/env nix-shell
#! nix-shell -i python3 -p python3Packages.dateutil -p python3Packages.holidays
# Count the working days (Mon-Fri, excluding holidays of the "HH" province
# code) in a range of whole months starting at a given ISO date.
#
# Usage: <script> DTSTART MONTHS
import argparse
import holidays
from dateutil.relativedelta import relativedelta
from dateutil.rrule import rrule, DAILY
from dateutil.parser import isoparse

parser = argparse.ArgumentParser()
parser.add_argument("dtstart")
parser.add_argument("months", type=int)
args = parser.parse_args()

# Inclusive range: start date up to the day before the same date
# `months` months later.
s = isoparse(args.dtstart)
e = s + relativedelta(months=args.months) - relativedelta(days=1)
r = rrule(freq=DAILY, dtstart=s, until=e)

# Build the calendar once instead of once per day of the range.
hh_holidays = holidays.Germany(prov="HH")
ct = sum(1 for d in r if d.weekday() < 5 and d not in hh_holidays)

print(f"range: {s:%d.%m.%Y} - {e:%d.%m.%Y}")
print("workdays w/o HH holidays:", ct)