Example 1
def load_datasets(config: configparser.ConfigParser, target_column: str) -> list:
    """
    Load datasets according to info specified in config file
    :param config: config with dataset specific info
    :param target_column: target_column for prediction
    :return: list of datasets to use for optimization
    """
    datasets_lst = list()
    # load and name raw dataset
    dataset_raw = pd.read_csv(config['General']['base_dir'] + 'Data/' + config[target_column]['dataset_raw'] + '.csv',
                              sep=';', decimal=',', index_col=0)
    try:
        dataset_raw.index = pd.to_datetime(dataset_raw.index, format='%d.%m.%Y')
    except ValueError:
        # fall back to ISO dates if the day-first format fails to parse
        dataset_raw.index = pd.to_datetime(dataset_raw.index, format='%Y-%m-%d')
    dataset_raw = dataset_raw.asfreq('D')
    dataset_raw.name = config[target_column]['dataset_raw']
    datasets_lst.append(dataset_raw)

    # split dataset at before_break_date
    if 'before_break_date' in config[target_column]:
        dataset_before_break = dataset_raw.copy()
        dataset_before_break.name = dataset_raw.name + '_before_break'
        before_break_date = datetime.datetime.strptime(config[target_column]['before_break_date'], '%Y-%m-%d').date()
        PreparationHelper.drop_rows_by_dates(df=dataset_before_break, start=before_break_date,
                                             end=dataset_before_break.index[-1])
        datasets_lst.append(dataset_before_break)

    return datasets_lst
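
For reference, a minimal sketch of a config file this function could consume; the section name 'demand', the file name 'sales_daily', and the base path are hypothetical, and load_datasets plus its dependencies are assumed to be importable:

import configparser

config = configparser.ConfigParser()
config.read_string("""
[General]
base_dir = /home/user/project/

[demand]
dataset_raw = sales_daily
before_break_date = 2020-03-01
""")
# expects /home/user/project/Data/sales_daily.csv with ';' separator and ',' decimals
datasets = load_datasets(config=config, target_column='demand')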
Example 2

    def insample(self, train: pd.DataFrame) -> pd.DataFrame:
        """
        Deliver (back-transformed) insample predictions
        :param train: train set
        :return: DataFrame with insample predictions
        """
        train_exog = None
        if self.use_exog:
            train_exog = train.drop(labels=[self.target_column], axis=1)
            PreparationHelper.drop_columns(train_exog, self.exog_cols_dropped)
            train_exog = train_exog.to_numpy(dtype=float)
        insample = pd.DataFrame(
            data=self.model.predict_in_sample(exogenous=train_exog),
            index=train.index,
            columns=['Insample'])
        if self.power_transformer is not None:
            insample = pd.DataFrame(
                data=self.power_transformer.inverse_transform(
                    insample['Insample'].values.reshape(-1, 1)),
                index=insample.index,
                columns=['Insample'])
        if self.log:
            if 0 in train[self.target_column].values:
                self.contains_zeros = True
                insample = np.exp(insample) - 1
            else:
                insample = np.exp(insample)
        return insample
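
The two branches of the log back-transform undo different forward transforms: if the training series contains zeros, training presumably used log(y + 1), so the inverse is exp(x) - 1; otherwise a plain log/exp pair. A tiny self-contained check of that round trip:

import numpy as np

y = np.array([0.0, 2.0, 5.0])        # series containing a zero
transformed = np.log(y + 1)          # forward transform assumed for training
recovered = np.exp(transformed) - 1  # branch taken when zeros are present
assert np.allclose(recovered, y)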
Example 3

    def train(self, train: pd.DataFrame, cross_val_call: bool = False) -> dict:
        """
        Train (S)ARIMA(X) model
        :param train: train set
        :param cross_val_call: called to perform cross validation
        :return: dictionary with cross-validated scores (if specified)
        """
        cross_val_score_dict = {}
        if cross_val_call:
            cross_val_score_dict, self.model = self.get_cross_val_score(
                train=train)
        train_exog = None
        if (self.power_transformer is not None) or self.log:
            train = TrainHelper.get_transformed_set(
                dataset=train,
                target_column=self.target_column,
                power_transformer=self.power_transformer,
                log=self.log)
        if self.use_exog:
            train_exog = train.drop(labels=[self.target_column], axis=1)
            self.exog_cols_dropped = train_exog.columns[
                train_exog.isna().any()].tolist()
            PreparationHelper.drop_columns(train_exog, self.exog_cols_dropped)
            train_exog = train_exog.to_numpy(dtype=float)
        self.model.fit(y=train[self.target_column],
                       exogenous=train_exog,
                       trend=self.trend)
        return cross_val_score_dict
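
The exogenous handling records which columns contain missing values before dropping them, so that insample and predict can later drop the same columns. A self-contained illustration of that idiom, with toy column names:

import numpy as np
import pandas as pd

train_exog = pd.DataFrame({'temperature': [1.0, 2.0, 3.0],
                           'promotion': [0.0, np.nan, 1.0]})
# columns with at least one missing value are recorded, then dropped
exog_cols_dropped = train_exog.columns[train_exog.isna().any()].tolist()
print(exog_cols_dropped)  # ['promotion']
train_exog = train_exog.drop(columns=exog_cols_dropped).to_numpy(dtype=float)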
Example 4
def add_public_holiday_counters(dataset: pd.DataFrame, event_lags: list,
                                special_days: list):
    """
    Function adding counters for upcoming or past public holidays (according to event_lags),
    with separate counters for the holidays listed in special_days
    :param dataset: dataset for adding features
    :param event_lags: lags before and after holiday to add
    :param special_days: list of days with their own counter as feature
    """
    for index, row in dataset.iterrows():
        holiday = row['public_holiday']
        if holiday != 'no':
            for lag in event_lags:
                if (index + pd.Timedelta(days=lag)) in dataset.index:
                    dataset.at[index + pd.Timedelta(days=lag),
                               'cal_PublicHoliday_Counter'] = -lag
                    if holiday in special_days:
                        dataset.at[index + pd.Timedelta(days=lag),
                                   'cal_' + holiday + '_Counter'] = -lag
    PreparationHelper.drop_columns(df=dataset, columns=['public_holiday'])
    dataset[[col for col in dataset.columns if 'Counter' in col]] = \
        dataset[[col for col in dataset.columns if 'Counter' in col]].fillna(value=99)
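
A toy run of the counter logic, with the drop and fillna steps inlined so it is self-contained; the counter convention (value -lag at offset lag, sentinel 99 elsewhere) matches the function above:

import pandas as pd

idx = pd.date_range('2021-05-07', periods=4, freq='D')
df = pd.DataFrame({'public_holiday': ['no', 'no', 'MothersDay', 'no']}, index=idx)
for index, row in df.iterrows():
    if row['public_holiday'] != 'no':
        for lag in [-1, 0, 1]:
            if (index + pd.Timedelta(days=lag)) in df.index:
                df.at[index + pd.Timedelta(days=lag), 'cal_PublicHoliday_Counter'] = -lag
df = df.drop(columns=['public_holiday']).fillna(value=99)
print(df)  # counters: 99 (far away), 1 (day before), 0 (holiday), -1 (day after)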
Example 5
def get_ready_train_test_lst(dataset: pd.DataFrame, config: configparser.ConfigParser,
                             init_train_len: int, test_len: int, split_perc: float, imputation: str, target_column: str,
                             reset_index: bool = False, dimensionality_reduction: str = None,
                             featureset: str = 'full') -> list:
    """
    Function preparing train and test sets for training based on raw dataset:
    - Missing Value imputation
    - Feature Extraction
    (- Resampling if specified)
    - Deletion of non-target sales columns
    - Split into train and test set(s)
    :param dataset: dataset with raw samples
    :param config: config with dataset specific info
    :param init_train_len: length of first train set
    :param test_len: usual length of test set (could be shorter for last test set)
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method to use
    :param target_column: target_column used for predictions
    :param reset_index: reset_index of dataset (relevant for Exponential Smoothing)
    :param dimensionality_reduction: perform dimensionality reduction
    :param featureset: featureset to use ('full', 'cal', 'stat', 'none')
    :return: list with train and test set(s)
    """
    print('##### Preparing Train and Test Sets #####')
    # get dataset specific parameters
    seasonal_periods = config[target_column].getint('seasonal_periods')
    features_for_stats = config[target_column]['features_for_stats'].replace(" ", "").split(',')
    resample_weekly = config[target_column].getboolean('resample_weekly')
    possible_target_cols = config[target_column]['possible_target_cols'].replace(" ", "").split(',')
    cols_to_condense, condensed_col_name = None, None
    # use stat and cal features according to specified featureset
    stat_features = True
    cal_features = True
    if featureset == 'none':
        stat_features = False
        cal_features = False
    elif featureset == 'cal':
        stat_features = False
    elif featureset == 'stat':
        cal_features = False

    if 'cols_to_condense' in config[target_column]:
        cols_to_condense = config[target_column]['cols_to_condense'].replace(" ", "").split(',')
        condensed_col_name = config[target_column]['condensed_col_name']

    # load train and test set with missing values
    train_test_list_mv = get_train_test_lst(dataset=dataset, init_train_len=init_train_len, test_len=test_len,
                                            split_perc=split_perc)
    train_test_list = list()
    counter_list_tuple = 0
    for train_mv, test_mv in train_test_list_mv:
        # impute the dataset, with the imputer fitted on the train set that still contains missing values
        if imputation is not None:
            dataset_imputed, _, _ = impute_dataset_train_test(imputation=imputation, dataset=dataset,
                                                              train=train_mv, test=test_mv)
        else:
            dataset_imputed = dataset.copy()
        MixedHelper.set_dtypes(df=dataset_imputed, cols_to_str=['public_holiday', 'school_holiday'])

        # feature extraction on imputed dataset
        if resample_weekly:
            # add stats features only after resampling to avoid information leakage through resampled values
            FeatureAdder.add_features(dataset=dataset_imputed, cols_to_condense=cols_to_condense,
                                      condensed_col_name=condensed_col_name, use_stat_features=False,
                                      use_calendar_features=cal_features)
        else:
            FeatureAdder.add_features(dataset=dataset_imputed, cols_to_condense=cols_to_condense,
                                      condensed_col_name=condensed_col_name, seasonal_periods=seasonal_periods,
                                      features_for_stats=features_for_stats, use_stat_features=stat_features,
                                      use_calendar_features=cal_features)

        dataset_feat = PreparationHelper.get_one_hot_encoded_df(
            df=dataset_imputed,
            columns_to_encode=list(set(dataset_imputed.columns).intersection(['public_holiday', 'school_holiday'])))
        dataset_feat.dropna(subset=[target_column], inplace=True)

        # resample if specified
        if resample_weekly:
            dataset_feat = dataset_feat.resample('W').apply(
                lambda x: PreparationHelper.custom_resampler(arraylike=x, summation_cols=possible_target_cols)
            )
            if 'cal_date_weekday' in dataset_feat.columns:
                PreparationHelper.drop_columns(df=dataset_feat, columns=['cal_date_weekday'])
            # drop rows added due to resampling of quarter dataset
            dataset_feat.dropna(inplace=True)
            init_train_len = int(train_test_list_mv[0][0].shape[0] / 7)
            test_len = int(train_test_list_mv[0][1].shape[0] / 7)
            seasonal_periods = int(seasonal_periods / 7)
            if stat_features:
                FeatureAdder.add_features(dataset=dataset_feat, seasonal_periods=seasonal_periods,
                                          features_for_stats=features_for_stats,
                                          use_calendar_features=False, with_weekday_stats=False,
                                          lags=[1, 4], windowsize_rolling=4, windowsize_rolling_seas=4)
                StatisticalFeatures.add_rolling_statistics_features(dataset=dataset_feat, windowsize=2,
                                                                    features=features_for_stats)
                StatisticalFeatures.add_rolling_seasonal_statistics_features(dataset=dataset_feat,
                                                                             windowsize=2,
                                                                             features=features_for_stats,
                                                                             seasonal_periods=seasonal_periods)

        # drop non-target columns
        cols_to_drop = possible_target_cols.copy()
        cols_to_drop.remove(target_column)
        PreparationHelper.drop_columns(df=dataset_feat, columns=cols_to_drop)

        # split into train and test set(s)
        if reset_index:
            dataset_feat.reset_index(drop=True, inplace=True)
        train_test_list_feat = get_train_test_lst(dataset=dataset_feat, init_train_len=init_train_len,
                                                  test_len=test_len, split_perc=split_perc)
        # impute missing values after adding statistical features (e.g. due to lagged features)
        if imputation is not None:
            _, train_feat_imp, test_feat_imp = impute_dataset_train_test(
                imputation=imputation, train=train_test_list_feat[counter_list_tuple][0],
                test=train_test_list_feat[counter_list_tuple][1])
        else:
            train_feat_imp = train_test_list_feat[counter_list_tuple][0]
            test_feat_imp = train_test_list_feat[counter_list_tuple][1]

        # perform dimensionality reduction if specified
        if not train_feat_imp.isna().any().any() and dimensionality_reduction == 'pca':
            train_feat_imp, test_feat_imp = pca_transform_train_test(train=train_feat_imp, test=test_feat_imp,
                                                                     target_column=target_column)
        train_test_list.append((train_feat_imp, test_feat_imp))
        if len(train_test_list_feat) > 1:
            # special treatment for a time series split with multiple (train, test) pairs:
            # the first loop iteration imputes based on train_1, the second based on train_2, etc.
            # each iteration creates all (train, test) pairs from the imputed dataset,
            # but only the pair matching the set used for imputation shall be kept
            counter_list_tuple += 1
    print('##### Prepared Train and Test Sets #####')
    return train_test_list
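
get_train_test_lst itself is not shown here; the calls above are consistent with an expanding-window time-series split when init_train_len and test_len are given, and a single percentage split via split_perc otherwise. A hypothetical sketch under that assumption, not the original helper:

import pandas as pd

def get_train_test_lst_sketch(dataset: pd.DataFrame, init_train_len: int = None,
                              test_len: int = None, split_perc: float = None) -> list:
    """Hypothetical reimplementation: expanding-window pairs or one percentage split."""
    if init_train_len is not None and test_len is not None:
        pairs = []
        train_end = init_train_len
        while train_end < dataset.shape[0]:
            # the last test set may be shorter than test_len
            pairs.append((dataset.iloc[:train_end],
                          dataset.iloc[train_end:train_end + test_len]))
            train_end += test_len
        return pairs
    split_ind = int(dataset.shape[0] * split_perc)
    return [(dataset.iloc[:split_ind], dataset.iloc[split_ind:])]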
Example 6
def load_datasets(config: configparser.ConfigParser, company: str,
                  target_column: str) -> list:
    """
    Load datasets according to info specified in config file
    :param config: config with dataset specific info
    :param company: name of the company related to the dataset
    :param target_column: target_column for prediction
    :return: list of datasets to use for optimization
    """
    datasets_lst = list()
    # load and name raw dataset
    try:
        dataset_raw = \
            pd.read_csv(config['General']['base_dir'] + 'Data/' + config[target_column]['dataset_raw'] + '.csv',
                        sep=';', decimal=',', index_col=0)
    except ValueError:
        # retry with '.' as decimal separator if parsing with ',' fails
        dataset_raw = \
            pd.read_csv(config['General']['base_dir'] + 'Data/' + config[target_column]['dataset_raw'] + '.csv',
                        sep=';', decimal='.', index_col=0)
    if isinstance(dataset_raw.index[0], str):
        if '.' in dataset_raw.index[0]:
            dataset_raw.index = pd.to_datetime(dataset_raw.index,
                                               format='%d.%m.%Y')
        elif '-' in dataset_raw.index[0]:
            dataset_raw.index = pd.to_datetime(dataset_raw.index,
                                               format='%Y-%m-%d')
    # drop columns from raw dataset if not needed
    if 'raw_cols_to_drop' in config[target_column]:
        PreparationHelper.drop_columns(
            df=dataset_raw,
            columns=config[target_column]['raw_cols_to_drop'].replace(
                " ", "").split(','))
    PreparationHelper.drop_columns(
        df=dataset_raw,
        columns=[col for col in dataset_raw.columns if 'Unnamed' in col])
    # drop samples after start_date_to_drop if target_column is not recorded for whole dataset
    if 'start_date_to_drop' in config[target_column]:
        start_date_to_drop = datetime.datetime.strptime(
            config[target_column]['start_date_to_drop'], '%Y-%m-%d').date()
        PreparationHelper.drop_rows_by_dates(df=dataset_raw,
                                             start=start_date_to_drop,
                                             end=dataset_raw.index[-1])

    if target_column in ['milk', 'beer', 'usdeaths', 'VisitorNights']:
        dataset_raw = dataset_raw.apply(lambda x: x.str.replace(
            ',', '.').astype(float) if x.dtype == object else x)
    elif target_column == 'maunaloa_monthly':
        dataset_raw = dataset_raw.resample('M').apply(
            lambda x: PreparationHelper.custom_resampler(arraylike=x,
                                                         summation_cols=[]))
    dataset_raw.name = company + config[target_column]['dataset_raw']
    datasets_lst.append(dataset_raw)

    # split dataset at before_break_date
    if 'before_break_date' in config[target_column]:
        dataset_before_break = dataset_raw.copy()
        dataset_before_break.name = dataset_raw.name + '_before_break'
        before_break_date = datetime.datetime.strptime(
            config[target_column]['before_break_date'], '%Y-%m-%d').date()
        PreparationHelper.drop_rows_by_dates(
            df=dataset_before_break,
            start=before_break_date,
            end=dataset_before_break.index[-1])
        datasets_lst.append(dataset_before_break)

    return datasets_lst
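
The decimal-comma conversion applied for some targets can be checked in isolation; a short self-contained example of the same column-wise apply:

import pandas as pd

df = pd.DataFrame({'sales': ['1,5', '2,25'], 'count': [3, 4]})
# only object (string) columns are converted; numeric columns pass through
df = df.apply(lambda x: x.str.replace(',', '.').astype(float)
              if x.dtype == object else x)
print(df.dtypes)  # sales: float64, count: int64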
Example 7

    def predict(self, test: pd.DataFrame, train: pd.DataFrame) -> pd.DataFrame:
        """
        Deliver (back-transformed) out-of-sample predictions, one step ahead if specified
        :param test: test set
        :param train: train set
        :return: DataFrame with predictions, upper and lower confidence level
        """
        test_exog = None
        if (self.power_transformer is not None) or self.log:
            test = TrainHelper.get_transformed_set(
                dataset=test,
                target_column=self.target_column,
                power_transformer=self.power_transformer,
                log=self.log,
                only_transform=True)
        if self.use_exog:
            test_exog = test.drop(labels=[self.target_column], axis=1)
            PreparationHelper.drop_columns(test_exog, self.exog_cols_dropped)
            test_exog = test_exog.to_numpy(dtype=float)
        if self.one_step_ahead:
            predict = []
            conf_low = []
            conf_up = []
            # deep copy model as predict function should not change class model
            model = copy.deepcopy(self.model)
            for i in range(0, test.shape[0]):
                if self.use_exog:
                    fc, conf = model.predict(n_periods=1,
                                             exogenous=pd.DataFrame(
                                                 test_exog[i].reshape(1, -1)),
                                             return_conf_int=True,
                                             alpha=0.05)
                    model.update(test[self.target_column].iloc[i],
                                 exogenous=pd.DataFrame(test_exog[i].reshape(
                                     1, -1)))
                else:
                    fc, conf = model.predict(n_periods=1,
                                             return_conf_int=True,
                                             alpha=0.05)
                    model.update(test[self.target_column].iloc[i])
                predict.append(fc[0])
                conf_low.append(conf[0][0])
                conf_up.append(conf[0][1])
        else:
            predict, conf = self.model.predict(n_periods=test.shape[0],
                                               exogenous=test_exog,
                                               return_conf_int=True,
                                               alpha=0.05)
            conf_low = conf[:, 0]
            conf_up = conf[:, 1]
        predictions = pd.DataFrame(
            {
                'Prediction': predict,
                'LowerConf': conf_low,
                'UpperConf': conf_up
            },
            index=test.index)

        if self.power_transformer is not None:
            predictions = pd.DataFrame(
                {
                    'Prediction':
                    self.power_transformer.inverse_transform(
                        predictions['Prediction'].values.reshape(-1,
                                                                 1)).flatten(),
                    'LowerConf':
                    self.power_transformer.inverse_transform(
                        predictions['LowerConf'].values.reshape(-1,
                                                                1)).flatten(),
                    'UpperConf':
                    self.power_transformer.inverse_transform(
                        predictions['UpperConf'].values.reshape(-1,
                                                                1)).flatten()
                },
                index=predictions.index)
        if self.log:
            predict_backtr = np.exp(predictions['Prediction'])
            if self.contains_zeros:
                # invert the log(y + 1) transform used when the series contains zeros
                predict_backtr -= 1
            lower_dist = (
                (predictions['Prediction'] - predictions['LowerConf']) /
                predictions['Prediction']) * predict_backtr
            upper_dist = (
                (predictions['UpperConf'] - predictions['Prediction']) /
                predictions['Prediction']) * predict_backtr
            predictions = pd.DataFrame(
                {
                    'Prediction': predict_backtr,
                    'LowerConf': predict_backtr - lower_dist,
                    'UpperConf': predict_backtr + upper_dist
                },
                index=predictions.index)
        return predictions
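
The one-step-ahead branch relies on pmdarima's update() to absorb each observed value before the next forecast. A minimal standalone sketch of that rolling-forecast pattern on synthetic data, without exogenous regressors:

import numpy as np
import pmdarima as pm

rng = np.random.default_rng(0)
y = np.sin(np.arange(60) / 5) + rng.normal(0, 0.1, 60)
train, test = y[:50], y[50:]
model = pm.ARIMA(order=(1, 0, 0)).fit(train)
forecasts = []
for obs in test:
    # forecast one period with a 95% confidence interval, then feed the
    # observed value back so the next forecast conditions on it
    fc, conf = model.predict(n_periods=1, return_conf_int=True, alpha=0.05)
    forecasts.append(fc[0])
    model.update([obs])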
Example 8

def add_features(dataset: pd.DataFrame,
                 cols_to_condense: list = None,
                 condensed_col_name: str = None,
                 seasonal_periods: int = 0,
                 features_for_stats: list = None,
                 use_calendar_features: bool = True,
                 use_stat_features: bool = True,
                 event_lags: list = None,
                 special_days: list = None,
                 lags: list = None,
                 windowsize_rolling: int = 7,
                 windowsize_rolling_seas: int = 7,
                 windowsize_rolling_weekday: int = 4,
                 with_weekday_stats: bool = True):
    """
    Function adding all specified features to dataset
    :param dataset: dataset used for adding features
    :param cols_to_condense: cols which should be condensed to one column
    :param condensed_col_name: name of condensed column
    :param seasonal_periods: seasonality used for seasonal-based features
    :param features_for_stats: features used for calculating statistical features
    :param use_calendar_features: specify if calendar features should be added
    :param use_stat_features: specify if statistical features should be added
    :param event_lags: lags for event counter features
    :param special_days: days with their own event counter
    :param lags: lags to use for lagged sales numbers
    :param windowsize_rolling: windowsize used for rolling statistics
    :param windowsize_rolling_seas: windowsize used for rolling seasonal statistics
    :param windowsize_rolling_weekday: windowsize used for rolling statistics for each weekday
    :param with_weekday_stats: specify if weekday specific stats should be added
    """
    if event_lags is None:
        event_lags = [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3]
    if special_days is None:
        special_days = ['Valentine', 'MothersDay', 'Karfreitag']
    if lags is None:
        lags = [1, 2, 3, 4, 5, 6, 7]

    print('---Starting to add features---')
    if cols_to_condense is not None and condensed_col_name is not None:
        dataset[condensed_col_name] = 0
        for col in cols_to_condense:
            dataset[condensed_col_name] += dataset[col]
        PreparationHelper.drop_columns(df=dataset, columns=cols_to_condense)
    if use_calendar_features:
        print('---Adding calendar features---')
        add_calendar_features(dataset=dataset,
                              event_lags=event_lags,
                              special_days=special_days)
    if use_stat_features:
        print('---Adding statistical features---')
        add_statistical_features(
            dataset=dataset,
            seasonal_periods=seasonal_periods,
            features_for_stats=features_for_stats,
            lags=lags,
            windowsize_rolling=windowsize_rolling,
            windowsize_rolling_seas=windowsize_rolling_seas,
            windowsize_rolling_weekday=windowsize_rolling_weekday,
            with_weekday_stats=with_weekday_stats)
    print('---Features added---')
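
The condensing step at the top of add_features can be illustrated in isolation (toy column names, drop inlined):

import pandas as pd

df = pd.DataFrame({'sales_shop': [1, 2], 'sales_online': [3, 4]})
# cols_to_condense are summed into one new column and then dropped
df['sales_total'] = 0
for col in ['sales_shop', 'sales_online']:
    df['sales_total'] += df[col]
df = df.drop(columns=['sales_shop', 'sales_online'])
print(df)  # sales_total: 4, 6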