コード例 #1
0
def timedata(dt):
    # Create extra time metadata for an InfluxDB point (makes sorting easier).
    # `dt` is assumed to be a POSIX timestamp accepted by
    # datetime.fromtimestamp — TODO confirm at the call site.
    timestamp = datetime.fromtimestamp(dt)
    year = timestamp.year
    month = timestamp.month
    hour = timestamp.hour
    minute = timestamp.minute
    weekday = timestamp.weekday()  # 0 = Monday ... 6 = Sunday

    # Determine the holiday export weight:
    #   - DE-only holiday           -> holiday == 1.0
    #   - DE + other countries      -> holiday > 1.0
    #   - only other countries      -> 0 < holiday < 1.0
    holiday = 1 if holidays.Germany().get(timestamp) else 0  # DE holiday = 1
    for cn, op in holiday_op:
        if cn == 'DE': continue  # DE was already counted above
        if op.get(timestamp):
            # Prefer the configured export weight for (country, year); fall
            # back to the country's population share (DE excluded).
            # NOTE(review): exportWeights[cn][year].get(month-1) returns None
            # when the month key is absent, which would make `holiday += None`
            # raise — confirm the table always covers all 12 months.
            holiday += exportWeights[cn][year].get(month-1) \
                if exportWeights and exportWeights.get(cn) and exportWeights[cn].get(year) \
                else country_pop[cn]/sum([pop for country,pop in country_pop.items() if country != 'DE'])

    return {
        'timestamp': timestamp,
        'year': year,
        'month': month,
        'hour': hour,
        'minute': minute,
        'weekday': weekday,
        'holiday': holiday
    }
コード例 #2
0
 def __add_holiday_features(df):
     """Add an integer ``holiday`` column: 1 on German public holidays, else 0."""
     df.index = pd.to_datetime(df.index)
     german_holidays = holidays.Germany()
     # Convert the index to CET so the holiday lookup uses local dates.
     local_dates = pd.Series(df.index.tz_convert("CET"), index=df.index)
     is_holiday = local_dates.apply(lambda day: day in german_holidays)
     df["holiday"] = is_holiday.astype(int)
     return df
コード例 #3
0
def impute_holiday(df):
    """Fill missing StateHoliday values: 'a' on German national holidays, '0' otherwise."""
    observed_years = df.Date.dt.year.unique()
    national_holidays = list(holidays.Germany(years=observed_years))

    is_missing = df.StateHoliday.isnull()
    on_holiday = df.Date.isin(national_holidays)

    df.loc[is_missing & on_holiday, 'StateHoliday'] = 'a'
    df.loc[is_missing & ~on_holiday, 'StateHoliday'] = '0'
    return df
コード例 #4
0
def add_bank_holidays(df: pd.DataFrame, year: int = 2016):
    """Add a ``bank_holiday`` indicator column (1 on German public holidays).

    Args:
        df: frame whose index contains dates; matching entries are flagged.
        year: calendar year whose holidays are looked up. Defaults to 2016,
            preserving the original hard-coded behaviour.

    Returns:
        The same frame with an integer ``bank_holiday`` column added.
    """
    de_holidays = holidays.Germany(years=year)
    df["bank_holiday"] = 0
    for date in de_holidays:
        if date in df.index:
            # df.loc[row, col] instead of df.bank_holiday.loc[row]: chained
            # indexing may assign to a temporary copy and silently do nothing.
            df.loc[date, "bank_holiday"] = 1
    # Aug 15 (Assumption Day) is only a state holiday (e.g. BY/SL), so the
    # nationwide calendar may omit it; flag it manually when present.
    assumption_day = datetime.date(year, 8, 15)
    if assumption_day not in de_holidays and assumption_day in df.index:
        df.loc[assumption_day, "bank_holiday"] = 1
    return df
コード例 #5
0
ファイル: vrr.py プロジェクト: ortenburger/GeoHack
def next_weekday_str(weekday, time_tamplate='%Y:%m:%d:%H:%M:%S'):
    # Return the next non-holiday weekday at 09:00 as a formatted string.
    # NOTE(review): the `weekday` parameter is never used — the day is
    # hard-coded to 2 (Wednesday) below; confirm whether it should be passed
    # to next_weekday() instead.
    germany_holidays = holidays.Germany()

    d = datetime.now()
    d = d.replace(hour=9, minute=0, second=0)

    day_number = 2
    next_day = next_weekday(d, 2)  # 0 = Monday, 1=Tuesday, 2=Wednesday...
    # This somehow is not working (original author's note).
    # NOTE(review): after the first iteration day_number becomes 9, 16, ... —
    # if next_weekday() expects a 0-6 weekday index, these values are invalid
    # and are probably the cause of the bug. Confirm next_weekday()'s contract.
    while next_day in germany_holidays:
        # print (next_day)
        # print (type(next_day))
        day_number += 7
        next_day = next_weekday(d, day_number)
    day_string = next_day.strftime(time_tamplate)
    return day_string
コード例 #6
0
 def __init__(self, inputCol=None, outputCol=None):
     """Initialise params, weekday names, time binning and the BW holiday calendar."""
     super(TimestampTransformer, self).__init__()
     self.setParams(**self._input_kwargs)
     weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday",
                      "Friday", "Saturday", "Sunday")
     self.day_to_str = dict(enumerate(weekday_names))
     # 24 h split into quarter-hour bins.
     self.bins = 24 * 4  # number of time bins per day
     # Note: bins must evenly divide 60
     self.minutes_per_bin = int((24. / float(self.bins)) * 60.)
     self.holidays = holidays.Germany(prov="BW")
コード例 #7
0
    'CH': 7.786,
    'CZ': 10.460,
    'DE': 81.750,
    'DK': 5.535,
    'FR': 64.610,
    'LU': 0.502,
    'NL': 16.570,
    'PL': 38.53,
    'SE': 9.341
}
holiday_op = [
    # (country code, holiday calendar) pairs, enumerated explicitly rather
    # than via a factory because of a bug in the holidays library.
    ('AT', holidays.Austria()),  # holidays for each country
    ('BE', holidays.Belgium()),  # implemented this way because of bug in library
    ('CH', holidays.Switzerland()),
    ('CZ', holidays.Czech()),
    ('DE', holidays.Germany()),
    ('DK', holidays.Denmark()),
    ('FR', holidays.France()),
    ('LU', holidays.Luxembourg()),
    ('NL', holidays.Netherlands()),
    ('PL', holidays.Poland()),
    ('SE', holidays.Sweden())
]


def timedata(dt):
    # create time data extra information for influxDB (makes sorting easier)
    timestamp = datetime.fromtimestamp(dt)
    year = timestamp.year
    month = timestamp.month
    hour = timestamp.hour
コード例 #8
0
    'CH': 7.786,
    'CZ': 10.460,
    'DE': 81.750,
    'DK': 5.535,
    'FR': 64.610,
    'LU': 0.502,
    'NL': 16.570,
    'PL': 38.53,
    'SE': 9.341
}
holiday_op = [
    # (country code, holiday calendar) pairs, enumerated explicitly rather
    # than via a factory because of a bug in the holidays library.
    ('AT', holidays.Austria()),  # holidays for each country
    ('BE', holidays.Belgium()),  # implemented this way because of bug in library
    ('CH', holidays.Switzerland()),
    ('CZ', holidays.Czech()),
    ('DE', holidays.Germany()),
    ('DK', holidays.Denmark()),
    ('FR', holidays.France()),
    ('LU', holidays.Luxembourg()),
    ('NL', holidays.Netherlands()),
    ('PL', holidays.Poland()),
    ('SE', holidays.Sweden())
]

#----------------------------------------------------------------------

# Append-mode file logging with timestamped lines; INFO level and above.
logging.basicConfig(filename="./influxDBLoad.log",
                    filemode="a",
                    format='%(asctime)s %(message)s',
                    datefmt='%Y.%m.%d %H:%M:%S',
                    level=logging.INFO)
コード例 #9
0
import holidays

# Print Baden-Württemberg public holidays for 2021 in chronological order.
de_bw_holidays = holidays.Germany(state='BW', years=2021).items()
for holiday_date, holiday_name in sorted(de_bw_holidays):
    print(holiday_date, holiday_name)
コード例 #10
0
from datetime import date


### STATIC VARIABLES

today = date.today()

# One holidays-library calendar per supported country/region, built once at
# import time so lookups (e.g. `date in calendar`) are cheap later.
australia_holidays = holidays.Australia()
austria_holidays = holidays.Austria()
canada_holidays = holidays.Canada()
colombia_holidays = holidays.Colombia()
czech_holidays = holidays.Czech()
denmark_holidays = holidays.Denmark()
england_holidays = holidays.England()
europeancentralbank_holidays = holidays.EuropeanCentralBank()
germany_holidays = holidays.Germany()
ireland_holidays = holidays.Ireland()
mexico_holidays = holidays.Mexico()
netherlands_holidays = holidays.Netherlands()
newzealand_holidays = holidays.NewZealand()
northernireland_holidays = holidays.NorthernIreland()
norway_holidays = holidays.Norway()
portugal_holidays = holidays.Portugal()
portugalext_holidays = holidays.PortugalExt()
scotland_holidays = holidays.Scotland()
spain_holidays = holidays.Spain()
unitedkingdom_holidays = holidays.UnitedKingdom()
unitedstates_holidays = holidays.UnitedStates()
wales_holidays = holidays.Wales()

def isDateHoliday(date, countryHolidays):
コード例 #11
0
 def __init__(self, inputCol=DEPARTURE, outputCol=BEFORE_HOLIDAY):
     """Configure input/output columns and load the Bavarian (BY) holiday calendar."""
     super().__init__()
     self._setDefault(inputCol=inputCol, outputCol=outputCol)
     self._set(inputCol=inputCol, outputCol=outputCol)
     self.holiday = holidays.Germany(prov='BY')
コード例 #12
0
#nb different zip
#raw_users.zip.nunique()

#test=raw_users[raw_users.team != '202']
#test=raw_users[raw_users.zip == '86130']

# Daily revenue totals per partition date, plotted as a raw time series.
users = raw_users.groupby('partition_date')['estimated_revenues'].sum()
py.plot([go.Scatter(x=users.index, y=users)])

# Reshape into Prophet's expected two-column frame: ds (date) / y (value).
users = pd.DataFrame(users).reset_index()
users.columns = ['ds', 'y']
users.head()

# German public holidays 2015-2019 as a Prophet holidays frame (ds/holiday).
holidays_c = pd.DataFrame(
    holidays.Germany(years=[2015, 2016, 2017, 2018, 2019]),
    index=[0]).transpose().reset_index()
holidays_c.columns = ['ds', 'holiday']

# Fit and forecast 62 days beyond the observed range.
m = Prophet(changepoint_prior_scale=2.5, holidays=holidays_c)
m.fit(users)
future = m.make_future_dataframe(periods=62)
forecast = m.predict(future)
py.plot([
    go.Scatter(x=users['ds'], y=users['y'], name='y'),
    go.Scatter(x=forecast['ds'], y=forecast['yhat'], name='yhat'),
    go.Scatter(x=forecast['ds'],
               y=forecast['yhat_upper'],
               fill='tonexty',
               mode='none',
コード例 #13
0
from datetime import date

import holidays

# Report whether today's date is a German public holiday.
today = date.today()
de_holidays = holidays.Germany()
is_holiday = today in de_holidays

print(is_holiday)
コード例 #14
0
from load_data import LoadData

import matplotlib.pyplot as plt
import holidays
import pandas as pd

h = holidays.Germany()


def plot_df(df, **kwargs):
    """Plot monthly mean values for 2018-2019, restricted to holidays and weekends."""
    recent = df[df["Datum"].dt.year.between(2018, 2019)]
    # Keep only rows falling on a German holiday or on a weekend (Sat/Sun).
    leisure_days = recent[recent.apply(
        lambda row: row["Datum"] in h or row["Datum"].weekday() > 4,
        axis=1)]
    monthly_mean = leisure_days.groupby(leisure_days["Datum"].dt.month).mean()
    plt.plot(monthly_mean, **kwargs)


pendlerstrecken = [1, 2, 4, 5, 6, 13]   # commuter routes
freizeitstrecken = [7, 9, 10, 11, 12]   # leisure routes
files = LoadData.load(freizeitstrecken)
avg_df = pd.DataFrame({"Datum": [], "Zaehlerstand": []})

for key, data in files.items():
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported replacement and concatenates rows the same way.
    avg_df = pd.concat([avg_df, data])
    plot_df(data,
            label=LoadData.NAMINGS[key],
            linestyle="dotted",
            linewidth=1.5)
コード例 #15
0
    def validate(cls,args,data, rules=[], **kwargs):
        """Validate data records and split them into valid and invalid sets.

        Acts as a filter for the calling script: four generic validation
        rules are applied first, then the caller-supplied custom rules, and
        only records passing every rule are returned as valid.

        NOTE(review): the mutable default ``rules=[]`` is shared across
        calls; it is only iterated here, so behaviour is unaffected, but a
        ``rules=None`` default would be safer.

        Args:
            args: unused in this method — TODO confirm whether callers rely
                on it being accepted.
            data (str): JSON string of the form
                '{"data": [{"column1": "value1", "column2": "value2", ...},
                           ...]}'
            rules (list): functions, each taking a JSON string of records and
                returning a ``(valid, invalid)`` tuple, where ``valid`` is a
                list of record dicts and ``invalid`` is a list of dicts
                augmented with the rule, reason, script, and time of
                validation, e.g.::

                    [{"data": '{"column1":"value1"...}', "rule": rule_value,
                      "reason": reason_value, "script": script_value,
                      "date_of_validation": date_value},
                     ...]

            kwargs:
                - 'script': name of the calling script; used by rule 1 and
                  stored with every invalid record.
                - 'google_key_path': the path to the service account key.

        Returns:
            tuple: ``(valid_list, invalid_list)``. ``valid_list`` is a list
            of record dicts. ``invalid_list`` is a list of
            ``(record, additional_info)`` tuples, where ``additional_info``
            is a dict with keys ``rule``, ``reason``, ``script``,
            ``date_of_validation`` and ``last_update_date``. Example::

                valid_list = [{"column1": value1, "column2": value2, ...},
                              ...]
                invalid_list = [(record, additional_info), ...]
        """
        def rule_1(row, tc, ntc, uk, de, script):
            # Rule 1: a row is valid when any non-timestamp column holds a
            # real (non-zero, non-None) value. All-zero rows are tolerated
            # for "price" scripts on UK/DE holidays; otherwise invalid.
            # Mutates the enclosing-scope `rule_list` set (closure).
            for c in ntc:
                if row[c] is not None and row[c] != 0:
                    return "valid"

            #All values were 0
            #Check if they are holiday or weekend
            for c in tc:
                try:
                    # Skip columns that don't parse as '%Y-%m-%d %H:%M:%S'.
                    parsed = datetime.strptime(row[c], '%Y-%m-%d %H:%M:%S')
                except Exception as e:
                    continue
                parsed_date = str(parsed.date())
                # NOTE(review): `not parsed.weekday()` is True only on Monday
                # (weekday() == 0); a weekend check would be
                # `parsed.weekday() >= 5` — confirm the intended condition.
                if parsed_date in uk or parsed_date in de or not parsed.weekday():
                    if "price" in script:
                        return "valid"
                    else:
                        rule_list.add("rule_1")
                        return "invalid"

            #Not weekend or holiday
            rule_list.add("rule_1")
            return "invalid"

        def rule_4(row,nsc):
            # Rule 4: invalid when every non-standard column is 0 or None.
            for c in nsc:
                if row[c] != 0 and row[c] is not None:
                    return "valid"

            rule_list.add("rule_4")
            return "invalid"

        data_decoded = jsonpickle.decode(data)
        valid_data = []
        invalid_data = []

        # Names of the generic rules that flagged at least one record.
        rule_list = set()
        reasons = {
            "rule_1": "Rule 1: all columns (except timestamp) are null or 0.",
            "rule_2": "Rule 2: the 4000 previously-validated records are identical.",
            "rule_3": "Rule 3: the same column had value of 0 or NULL for the last 5 rows.",
            "rule_4": "Rule 4: standard columns like Constituent name, id, date have values but all others are 0."
        }

        #First, perform generic data validation rules
        #Rule 1: check if all columns (except timestamp) are null or 0
        #Get timestamp and non-timestamp columns
        script = kwargs["script"]
        uk_holidays = holidays.UK()
        de_holidays = holidays.Germany()
        df = pd.DataFrame(data_decoded["data"])
        standard_columns = ['constituent_name', "constituent_id", "date", "last_update_date"]
        original_columns = df.columns
        # Columns are classified purely by name substring ("date"/"time").
        timestamp_columns = [c for c in df.columns if ("date" in c or "time" in c)]
        non_timestamp_columns = [c for c in df.columns if ("date" not in c and "time" not in c)]
        non_standard_columns = [c for c in df.columns if c not in standard_columns + timestamp_columns]

        df["rule_1"] = df.apply(lambda x: rule_1(x,timestamp_columns,non_timestamp_columns,uk_holidays,
                                                 de_holidays,script), axis=1)

        #Rule 2: Are the 4000 previously-validated records identical?
        # Sliding 4000-row window; any row duplicated (on non-timestamp
        # columns) inside a window is flagged. NOTE(review): this is
        # O(n * 4000) row-scans — expensive on large frames.
        invalid_indices = set()
        if df.shape[0] >= 4000:
            #We can apply rule
            start = 0
            end = 3999
            while end < df.shape[0]:
                df_temp = df.loc[start:end]
                df_duplicates = df_temp[df_temp.duplicated(subset=non_timestamp_columns, keep=False)]
                invalid_indices.update(list(df_duplicates.index))

                start += 1
                end += 1

        df["rule_2"] = "valid"
        df.loc[list(invalid_indices), "rule_2"] = "invalid"

        if len(invalid_indices) > 0:
            rule_list.add("rule_2")

        #Rule 3: Has the same column had value of 0 or NULL for the last 5 rows?
        # Sliding 5-row window per column; series.any() == False means all
        # five values are falsy (0/None/NaN-free falsy values).
        invalid_indices_2 = set()
        start = 0
        end = 4
        while end < df.shape[0]:
            for c in non_timestamp_columns:
                series = df.loc[start:end][c]
                if series.any() == False:
                    invalid_indices_2.update(list(series.index))

            start += 1
            end += 1

        df["rule_3"] = "valid"
        df.loc[list(invalid_indices_2), "rule_3"] = "invalid"

        if len(invalid_indices_2) > 0:
            rule_list.add("rule_3")

        #Rule 4: If standard columns like Constituent name, id, date have values but all others are 0 reject?

        df["rule_4"] = df.apply(lambda x: rule_4(x,non_standard_columns), axis=1)

        # Placeholder column (stores the builtin `object` type); each cell is
        # overwritten with the row's list of violated rules in the loop below.
        df["rule"] = object

        #Get invalid records
        invalid_indices = set()

        for i in range(0, df.shape[0]):
            row_rules = []
            row = df.iloc[i]

            for r in ["rule_1","rule_2", "rule_3", "rule_4"]:
                if row[r] == "invalid":
                    invalid_indices.add(i)
                    row_rules.append(r)

            df.at[i, 'rule'] = row_rules

        #Get valid indices
        valid_indices = set(df.index.tolist())
        valid_indices = valid_indices.difference(invalid_indices)

        valid_data += df.loc[list(valid_indices)][original_columns.tolist() + ["rule"]].to_dict(orient='records')
        invalid_data += df.loc[list(invalid_indices)][original_columns.tolist() + ["rule"]].to_dict(orient='records')

        #Custom rules
        # Each custom rule further filters the surviving valid records.
        custom_invalid = []
        for func in rules:
            valid, invalid = func(jsonpickle.encode({"data":valid_data}))
            #print(invalid)
            #Check if valid
            valid_data = valid
            custom_invalid += invalid

        #Format invalid data
        invalid_data_store = []

        #Add rule and reason
        # NOTE(review): records flagged only by rule_1/rule_4 are skipped
        # here, so they end up in neither the valid nor the stored invalid
        # output — confirm this is intentional.
        for item in invalid_data:
            if "rule_1" in item["rule"] or "rule_4" in item["rule"]:
                continue
            additional_info = {}
            additional_info["rule"] = item["rule"]
            # NOTE(review): the isinstance guard re-tests the whole list for
            # every element; presumably intended as a pre-check before the
            # comprehension — verify.
            additional_info["reason"] = [reasons[r] for r in item["rule"] if isinstance(item["rule"], list)]
            additional_info["script"] = script
            additional_info["date_of_validation"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            additional_info["last_update_date"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            invalid_data_store.append((item,additional_info))

        invalid_data_store += custom_invalid

        return list(valid_data), invalid_data_store
コード例 #16
0
#!/usr/bin/env nix-shell
#! nix-shell -i python3 -p python3Packages.dateutil -p python3Packages.holidays

import argparse
import holidays
from dateutil.relativedelta import relativedelta
from dateutil.rrule import rrule, DAILY
from dateutil.parser import isoparse

# CLI: count workdays (Mon-Fri, excluding Hamburg public holidays) in a range
# of `months` months starting at `dtstart` (an ISO date string).
parser = argparse.ArgumentParser()
parser.add_argument("dtstart")
parser.add_argument("months", type=int)
args = parser.parse_args()

s = isoparse(args.dtstart)
e = s + relativedelta(months=args.months) - relativedelta(days=1)

r = rrule(freq=DAILY, dtstart=s, until=e)

# Build the holiday calendar once; the original constructed a fresh
# holidays.Germany(prov="HH") inside the comprehension for every single day.
hh_holidays = holidays.Germany(prov="HH")
ct = sum(1 for d in r if d.weekday() < 5 and d not in hh_holidays)

print(f"range: {s:%d.%m.%Y} - {e:%d.%m.%Y}")
print("workdays w/o HH holidays:", ct)