Code example #1
# Assumed imports for this excerpt (MBB is taken to be arch's MovingBlockBootstrap):
import numpy as np
from sklearn.preprocessing import PowerTransformer
from statsmodels.tsa.seasonal import seasonal_decompose
from arch.bootstrap import MovingBlockBootstrap as MBB


def augmentation(X, Y, noise=False, bootstrapping=True,
                 noiseSTD=[0.1/2, 0.1/2, 0.01/2, 0.0002/2, 0.01/2, 0.02/2],
                 nr_boot=1000, bootstrap_bl_size=488, boot_freq=100):
    
    if noise:
        Xn = X.copy()
        for i, j, k in np.ndindex(X.shape):
            Xn[i, j, k] += np.random.normal(0, 1)*noiseSTD[k] 

        X = np.vstack([X, Xn])
        Y = np.vstack([Y, Y])
        
    if bootstrapping:
        Xb = X.copy()
        pt = PowerTransformer(method='yeo-johnson', standardize=True)
        
        for i in range(Xb.shape[0]):
            pt.fit(Xb[i])
            lambda_param = pt.lambdas_
            transformed = pt.transform(Xb[i])
            result = seasonal_decompose(transformed, model='additive', freq=boot_freq)
            
            # Moving Block Bootstrap on Residuals
            bootstrapRes = MBB(bootstrap_bl_size, result.resid)
            # keep only the last bootstrap replicate of the residuals
            for data in bootstrapRes.bootstrap(nr_boot):
                bs_x = data[0][0]
            
            reconSeriesYC = result.trend + result.seasonal + bs_x
            Xb[i] = pt.inverse_transform(reconSeriesYC)
        
        for i,j,k in np.ndindex(X.shape):
            if np.isnan(Xb[i,j,k]):
                Xb[i,j,k] = X[i,j,k]
        X = np.vstack([X, Xb])
        Y = np.vstack([Y, Y])

    return X, Y
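A minimal, self-contained sketch of the noise-injection branch above, assuming X has shape (samples, timesteps, channels) and Y has shape (samples, targets); the array sizes and noise levels here are illustrative only.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(4, 10, 3))                  # (samples, timesteps, channels)
Y = rng.normal(size=(4, 1))
noise_std = np.array([0.05, 0.05, 0.005])        # one (hypothetical) std per channel

Xn = X + rng.normal(0.0, 1.0, size=X.shape) * noise_std   # broadcast over the channel axis
X_aug = np.vstack([X, Xn])                       # original plus jittered copy
Y_aug = np.vstack([Y, Y])                        # labels are duplicated unchanged
print(X_aug.shape, Y_aug.shape)                  # (8, 10, 3) (8, 1)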
Code example #2
    def infer(self):

        train_pred = self.model.predict((self.X_train))
        val_pred = self.model.predict((self.X_val))
        test_pred = self.model.predict((self.X_test))
        print(
            "-----------------------------------------------------------------"
        )
        print("Training results", "\n")
        if self.transform is not None:
            scaler = PowerTransformer(method="box-cox")
            scaler.fit(np.array(self.train.actual_load).reshape(-1, 1))
            inv_train_pred = scaler.inverse_transform(
                np.array(train_pred).reshape(-1, 1))
            inv_val_pred = scaler.inverse_transform(
                np.array(val_pred).reshape(-1, 1))
            inv_test_pred = scaler.inverse_transform(
                np.array(test_pred).reshape(-1, 1))
            print(
                "Training error: ",
                mse(self.train.actual_load, inv_train_pred, squared=False),
            )
            print(
                "Validation error: ",
                mse(self.val.actual_load, inv_val_pred, squared=False),
            )
            print("Test error: ", mse(self.y_test,
                                      inv_test_pred,
                                      squared=False))
            print(
                "Note : The error printed above is calculated after the inverse transform of box-cox"
            )

        else:
            print("Training error: ",
                  mse(self.y_train, train_pred, squared=False))
            print("Validation error: ", mse(self.y_val,
                                            val_pred,
                                            squared=False))
            print("Test error: ", mse(self.y_test, test_pred, squared=False))
Code example #3
import typing as t

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer


class TargetPreprocessor(BaseEstimator, TransformerMixin):
    """ Stabilizes the variance of the target """
    def __init__(self):
        self.preprocessor = PowerTransformer()

    def fit(self,
            X: pd.Series,
            y: t.Optional[pd.Series] = None) -> 'TargetPreprocessor':
        self.preprocessor.fit(X.values.reshape(-1, 1), y)
        return self

    def transform(self, X: pd.Series) -> pd.Series:
        return pd.Series(data=self.preprocessor.transform(
            X.values.reshape(-1, 1)).flatten(),
                         name='loss',
                         index=X.index)

    def inverse_transform(self, X: pd.Series) -> pd.Series:
        return pd.Series(data=self.preprocessor.inverse_transform(
            X.values.reshape(-1, 1)).flatten(),
                         name='loss',
                         index=X.index)
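A hedged usage sketch for the class above, assuming TargetPreprocessor and its imports are in scope; the lognormal series is a stand-in for a skewed loss target.

import numpy as np
import pandas as pd

y = pd.Series(np.random.lognormal(mean=2.0, sigma=0.8, size=200), name='loss')  # skewed target
tp = TargetPreprocessor().fit(y)
y_t = tp.transform(y)                  # roughly Gaussian, index preserved
y_back = tp.inverse_transform(y_t)     # round-trips to the original scale
print(float(np.abs(y_back - y).max()))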
Code example #4
    valor_arrecadacao_serie_temporal_lstm_treino = LSTMUtil.cria_intervalos_temporais(valor_treino_pwr)
    valor_arrecadacao_serie_temporal_lstm_teste = LSTMUtil.cria_intervalos_temporais(valor_teste_pwr)

    model = LSTMUnivariada(df_treino)
    checkpoint = ModelCheckpoint('checkpoint_regressor_'+tributo+'_teste_power_transformer.hdf5', monitor='loss', verbose=2,
                                save_best_only=True, save_weights_only=False,
                                mode='auto', period=1)
    model.compile(optimizer=ko.Adam(lr=0.1), loss='mse')
    model.fit([np_dia_mes_treino, valor_arrecadacao_serie_temporal_lstm_treino], saida_treino, validation_data=([np_dia_mes_teste, valor_arrecadacao_serie_temporal_lstm_teste], saida_teste), 
              epochs=100, batch_size=50, callbacks=[checkpoint])
    
    # Load the best model saved by the checkpoint
    model.load_weights('checkpoint_regressor_'+tributo+'_teste_power_transformer.hdf5')
    
    pwr_pred = model.predict([np_dia_mes_teste, valor_arrecadacao_serie_temporal_lstm_teste])    
    mae_pwr = mean_absolute_error(pwr_scaler.inverse_transform(saida_teste), pwr_scaler.inverse_transform(pwr_pred))
    print('The MAE for tax type '+tributo+' using the "Power Transformer" was '+str(mae_pwr))
    
    comparativo.loc[tributo, 'PowerTransformer'] = mae_pwr
    
# %%  Train the LSTM network with a single quantitative variable using the Power Transformer as the scaler, since it performed best

for tributo in pd_arrecad_diaria['Tributo'].unique():
    # Use the method that extracts a test dataset identical to the one used with Prophet
    df_treino, df_teste = LSTMUtil.gera_teste_identico_prophet(arrecad_diaria[tributo], pd_datas_testes.loc[tributo+' - Prophet - Univariável - Sem Remoção de Outliers', 'Inicio'], pd_datas_testes.loc[tributo+' - Prophet - Univariável - Sem Remoção de Outliers', 'Fim'])   
    
    print('Tax type ' + tributo + ' - test DF start: ' + str(
        df_teste.reset_index().loc[0, 'Data']) + ' - test DF end: ' + str(
        df_teste.reset_index().loc[len(df_teste) - 1, 'Data']))
    df_treino = LSTMUtil.transforma_dataframe(df_treino, 'Data')
    df_teste = LSTMUtil.transforma_dataframe(df_teste, 'Data')
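A standalone sketch of the evaluation step above, where predictions made in transformed space are inverted with the fitted scaler (pwr_scaler in the excerpt) before computing the MAE; the data here is synthetic.

import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_absolute_error

y_true = np.random.rand(200, 1) * 1000                        # synthetic revenue-like target
pwr_scaler = PowerTransformer().fit(y_true)
y_true_t = pwr_scaler.transform(y_true)
y_pred_t = y_true_t + np.random.normal(0, 0.05, size=y_true_t.shape)  # stand-in for model.predict

mae = mean_absolute_error(pwr_scaler.inverse_transform(y_true_t),
                          pwr_scaler.inverse_transform(y_pred_t))
print('MAE on the original scale:', mae)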
Code example #5
                np.dot(this_cov_test_prior[i, :, :],
                       np.transpose(this_H[i, :, :])),
                np.linalg.inv(this_G[i, :, :])), this_residual[i, :])
        this_cov_test_posterior[i, :] = this_cov_test_prior[i, :, :] - np.dot(
            np.dot(
                np.dot(
                    np.dot(this_cov_test_prior[i, :, :],
                           np.transpose(this_H[i, :, :])),
                    np.linalg.inv(this_G[i, :, :])), this_H[i, :, :]),
            this_cov_test_prior[i, :, :])

    this_mu_test_prior = this_mu_test_posterior
    this_cov_test_prior = this_cov_test_posterior

# Invert the power transform for each of the 5 series (24 hourly values each).
mu_test_posterior_inv = np.empty((5, 24))
for i in range(5):
    mu_test_posterior_inv[i, :] = pt_no2_0.inverse_transform(
        this_mu_test_posterior[i, :].reshape(-1, 1)).reshape(-1)

std_test_posterior = np.empty((5, 24))
for i in range(5):
    std_test_posterior[i, :] = np.sqrt(
        np.diag(this_cov_test_posterior[i, :, :]))

low_test_posterior = this_mu_test_prior - 2 * std_test_posterior
Code example #6
def player_arima(data,
                 player_name,
                 index='date',
                 feature='cumStatpoints',
                 forecast_from='2018-10-03',
                 transform='none',
                 player_id=None,
                 roster=None,
                 summary=False):
    """ performs Auto-ARIMA on a single player """
    # TODO: add logic for if the player ID is given but not a roster (use function in package)
    if player_id and roster:
        player_name = roster[roster['Unnamed: 0'] == player_id]
    player_df = data[data['name'] == player_name]
    player_df.drop_duplicates(subset='date', keep='first', inplace=True)
    player_train_df = player_df[player_df['date'] < forecast_from]
    player_test_df = player_df[player_df['date'] >= forecast_from]
    player_train_df = player_train_df.loc[:, [index, feature]]
    player_train_df = player_train_df.set_index(index, drop=True)
    if player_train_df.shape[0] == 0:
        st.write('{} is a rookie!'.format(player_name))
        return None
    if transform == 'log':
        # TODO: make this stat agnostic
        player_train_df.loc[:, 'logValues'] = np.log(player_train_df['cumStatpoints'])
    elif transform == 'yj':
        transformer = PowerTransformer()
        transformer.fit(player_train_df.values.reshape(-1, 1))
        player_train_df.loc[:, 'transformedValues'] = transformer \
                                                      .transform(
                                                          player_train_df['cumStatpoints'] \
                                                          .values.reshape(-1, 1))
        player_train_df.drop('cumStatpoints', axis=1, inplace=True)
    player_test_df = player_test_df.loc[:, [index, feature]]
    player_test_df = player_test_df.set_index(index, drop=True)
    # player_train_df = player_train_df[:'2018-10-03']
    # player_test_df = player_test_df['2018-10-03':]
    if player_test_df.shape[0] == 0:
        st.write('{} retired!'.format(player_name))
        return None
    start_time = time.time()
    st.write('Searching ARIMA parameters for {}...'.format(player_name))
    try:
        model = pm.auto_arima(player_train_df,
                              start_p=1,
                              start_q=1,
                              max_p=5,
                              max_q=5,
                              max_d=3,
                              m=3,
                              start_P=0,
                              start_Q=0,
                              seasonal=True,
                              information_criterion='aicc',
                              error_action='ignore',
                              suppress_warnings=True,
                              stepwise=True)
        st.write('Model built, fitting...')
        model.fit(player_train_df)
    except ValueError:
        st.write("{} doesn't have enough data!".format(player_name))
        return None
    except IndexError:
        st.write('Index error for {}'.format(player_name))
        return None
    except:
        st.write('Unhandled error for {}'.format(player_name))
        return None
    predictions, intervals = model.predict(n_periods=player_test_df.shape[0], return_conf_int=True)
    if transform == 'log':
        predictions = np.exp(predictions)
        intervals = np.exp(intervals)
    elif transform == 'yj':
        predictions = transformer.inverse_transform(predictions.reshape(-1, 1))
        low_intervals = transformer.inverse_transform(intervals[:, 0].reshape(-1, 1))
        high_intervals = transformer.inverse_transform(intervals[:, 1].reshape(-1, 1))
    end_time = time.time()
    if transform != 'yj':
        low_intervals = []
        high_intervals = []
        for low, high in intervals:
            low_intervals.append(low)
            high_intervals.append(high)
    prediction_residuals = calculate_test_residuals(predictions, player_test_df)
    if summary:
        st.text(model.summary())
    train_residuals = pd.DataFrame(model.resid())
    train_mfe, train_mae, train_rmse = calculate_errors(train_residuals)
    test_mfe, test_mae, test_rmse = calculate_errors(prediction_residuals)
    model_params = model.get_params()
    p, d, q = model_params['order']
    try:
        P, D, Q, m = model_params['seasonal_order']
    except TypeError:
        st.write('Search failed to find valid options.')
        return None
    st.write("{0}'s Auto-ARIMA({1},{2},{3})({4},{5},{6},{7}) took {8:.3f} seconds." \
             .format(player_name, p, d, q, P, D, Q, m, end_time-start_time))
    results_df = pd.DataFrame({'forecastStart':forecast_from,
                               'aic':model.aic(),
                               'p':p,
                               'd':d,
                               'q':q,
                               'P':P,
                               'D':D,
                               'Q':Q,
                               'm':m,
                               'trainMfe':train_mfe,
                               'trainMae':train_mae,
                               'trainRmse':train_rmse,
                               'trainResiduals':[train_residuals],
                               'testMfe':test_mfe,
                               'testMae':test_mae,
                               'testRmse':test_rmse,
                               'testResiduals':[prediction_residuals],
                               'intervalLow':[low_intervals],
                               'intervalHigh':[high_intervals]},
                              index=[player_name])
    return results_df
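A compact, self-contained sketch of the 'yj' branch of this workflow with pmdarima: fit the Yeo-Johnson transform on the training series, model in transformed space, then invert both the point forecasts and the interval bounds. The series is synthetic and the auto_arima settings are simplified relative to the excerpt.

import numpy as np
import pandas as pd
import pmdarima as pm
from sklearn.preprocessing import PowerTransformer

y = pd.Series(np.cumsum(np.random.poisson(1.2, size=120)).astype(float))  # synthetic cumulative series
pt = PowerTransformer()                                      # Yeo-Johnson by default
y_t = pt.fit_transform(y.values.reshape(-1, 1)).ravel()

model = pm.auto_arima(y_t, seasonal=False, error_action='ignore', suppress_warnings=True)
preds_t, intervals_t = model.predict(n_periods=10, return_conf_int=True)

# Map point forecasts and both interval bounds back to the original scale.
preds = pt.inverse_transform(np.asarray(preds_t).reshape(-1, 1)).ravel()
low = pt.inverse_transform(intervals_t[:, 0].reshape(-1, 1)).ravel()
high = pt.inverse_transform(intervals_t[:, 1].reshape(-1, 1)).ravel()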
Code example #7
def get_outliers(
        data, STD_NORM, side, METHOD='yeo-johnson',
        PLOT=False, title=None, title_fontsize=None,
        x_label=None, y_label=None, label_fontsize=None
):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.preprocessing import PowerTransformer
    from statsmodels.graphics.gofplots import qqplot
    import colourPals as cp
    import importlib
    importlib.reload(cp)
    # ==================================================
    # Error checking
    assert side == 'left' or side == 'right', "'side' argument has to be either 'left' or 'right'"
    # ==================================================
    # If the minimum of the data is not positive and 'box-cox' is selected, compute a constant k to shift the data so that the transformation can be performed.
    if METHOD == 'box-cox' and min(data) <= 0:
        k = 1 - min(data)
        data = data + k

    # ----- Transform the data
    pt = PowerTransformer(method=METHOD)
    # Find optimal lambda value for transform
    pt.fit(data.to_numpy().reshape(-1, 1))
    # Transform the data to an approximately normal distribution
    data_trans = pt.transform(data.to_numpy().reshape(-1, 1))

    # ----- Compute the threshold above or below which data is flagged
    data_trans_thres = data_trans.mean() + STD_NORM*data_trans.std()
    # Transform threshold back to original distribution
    data_thres = pt.inverse_transform(np.array(data_trans_thres).reshape(1, -1))
    data_thres = data_thres.flatten()[0]

    # If the data was shifted before, shift it back by the same constant.
    if 'k' in locals():
        data_thres = data_thres - k
        data = data - k

    # If side == 'left', flag the negative (lower) tail of the data.
    # If side == 'right', flag the positive (upper) tail of the data.
    if side == 'left':
        outliers = data[data < data_thres]
    elif side == 'right':
        outliers = data[data > data_thres]
    else:
        raise ValueError("Argument side has to be 'left'or 'right' ")

    # Flatten and convert the transformed data to a Series
    data_trans = pd.Series(data_trans.flatten())

    if PLOT:
        FIG_SIZE = 3
        sns.set_style("darkgrid")
        sns.set_context("notebook")
        fig, ax = plt.subplots(nrows=3, figsize=(FIG_SIZE*2, FIG_SIZE*3), dpi=300)

        # Plot coeffMax before transformation
        sns.distplot(data, rug=True, kde=False, ax=ax[0], color=cp.cbPaired['blue'])
        ax[0].axvline(x=data_thres, c=cp.cbPaired['red'])
        ax[0].set_title(title, fontsize=title_fontsize)
        ax[0].set_xlabel(x_label, fontsize=label_fontsize)
        ax[0].set_ylabel(f"Frequency", fontsize=label_fontsize)

        # Plot coeffMax after transformation
        sns.distplot(data_trans, rug=True, kde=False, ax=ax[1], color=cp.cbPaired['purple'])
        ax[1].axvline(x=data_trans_thres, c=cp.cbPaired['red'])
        ax[1].set_xlabel(f"{METHOD.capitalize()} Transformed", fontsize=label_fontsize)
        ax[1].set_ylabel(f"Frequency", fontsize=label_fontsize)

        # Plot qqplot of coeffMax after transformation
        qqplot(data_trans, ax=ax[2], line='s', color=cp.cbPaired['purple'])

        plt.tight_layout()
        plt.show()

    return outliers, data_thres
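A standalone sketch of the thresholding idea used by get_outliers, without the plotting and the project-specific colourPals import; the data and the 2-sigma cut-off are illustrative.

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

data = pd.Series(np.random.lognormal(0.0, 0.6, size=500))    # skewed, strictly positive sample
pt = PowerTransformer(method='yeo-johnson')
data_trans = pt.fit_transform(data.to_numpy().reshape(-1, 1))

# Threshold at +2 standard deviations in the (roughly Gaussian) transformed space ...
thres_trans = data_trans.mean() + 2.0 * data_trans.std()
# ... then map the threshold back to the original scale before comparing.
thres = pt.inverse_transform(np.array([[thres_trans]])).ravel()[0]
outliers = data[data > thres]
print(f"threshold={thres:.3f}, n_outliers={len(outliers)}")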
Code example #8
File: ml_introduction_2.py  Project: sheviv/mirrors
target_processor = PowerTransformer().fit(shuffled_y[:train].reshape(-1, 1))
transformed_y = target_processor.transform(shuffled_y.reshape(-1, 1)).flatten()
plt.title('Transformed target variable, train/test')
sns.distplot(transformed_y[:train])
sns.distplot(transformed_y[train:])

from sklearn.metrics import mean_absolute_error
regressor = MLPRegressor(hidden_layer_sizes=[20, 20],
                         activation='relu',
                         max_iter=1000,
                         random_state=1)
regressor.fit(shuffled_X[:train], transformed_y[:train])
"R2 %.3f, ошибка в возрасте: %.2f, разброс значений возраста %.2f" % (
    r2_score(
        shuffled_y[train:],
        target_processor.inverse_transform(
            regressor.predict(shuffled_X[train:]).reshape(-1, 1))),
    mean_absolute_error(
        shuffled_y[train:],
        target_processor.inverse_transform(
            regressor.predict(shuffled_X[train:]).reshape(
                -1, 1))), shuffled_y[train:].std())

# Now let's look at classification on the Iris dataset
# classifier = MLPClassifier(
#     hidden_layer_sizes=[32, 12],
#     activation='tanh',
#     max_iter=1000,
#     random_state=1)
# X_changed = MinMaxScaler(
#     feature_range=(-1, 1)
# ).fit_transform(X)
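As a side note, the same fit-on-train / inverse-on-predict target handling can be written more compactly with scikit-learn's TransformedTargetRegressor; a sketch with synthetic data:

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PowerTransformer

X = np.random.rand(300, 5)
y = np.random.lognormal(0.0, 0.5, size=300)                  # skewed synthetic target

# The target is power-transformed before fitting and inverse-transformed on predict.
model = TransformedTargetRegressor(
    regressor=MLPRegressor(hidden_layer_sizes=[20, 20], max_iter=1000, random_state=1),
    transformer=PowerTransformer())
model.fit(X, y)
preds = model.predict(X)                                     # already back on the original scale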
Code example #9
model.add(Dense(120))
model.add(Dense(10))
model.add(LSTM(48, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(3))
model.add(Dense(1))

model.compile(loss='mse', optimizer='ADAgrad')
history = model.fit(Xtrain,
                    Ytrain,
                    batch_size=41,
                    epochs=10000,
                    validation_data=(Xtest, Ytest))

Ypred = model.predict(X)
Ypred = ss.inverse_transform(Ypred)
#Ypred = scaler.inverse_transform(Ypred)
Ypred = np.reshape(Ypred, len(Ypred))
Ypred = pd.Series(Ypred)
Yreel = tt1[timestep:]
#test predict
Ypred_test = model.predict(Xtest)
Ypred_test = Ypred_test.reshape(len(Ypred_test), 1)
Ypred_test = ss.inverse_transform(Ypred_test)
Ypred_test = np.reshape(Ypred_test, len(Ypred_test))
Ypred_test = pd.Series(Ypred_test)
Yreel_test = tt1[46:]
#train predict
Ypred_train = model.predict(Xtrain)
Ypred_train = ss.inverse_transform(Ypred_train)
Ypred_train = np.reshape(Ypred_train, len(Ypred_train))
Code example #10
File: transform.py  Project: wuzunzun/XenonPy
# Assumed imports for this excerpt, inferred from the names used below:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer as PT


class PowerTransformer(BaseEstimator, TransformerMixin):
    """
    Box-cox transform.
    References
    ----------
    G.E.P. Box and D.R. Cox, “An Analysis of Transformations”,
    Journal of the Royal Statistical Society B, 26, 211-252 (1964).
    """
    def __init__(self,
                 *,
                 method='yeo-johnson',
                 standardize=False,
                 lmd=None,
                 tolerance=(-np.inf, np.inf),
                 on_err=None):
        """

        Parameters
        ----------
        method: 'yeo-johnson' or 'box-cox'
            ‘yeo-johnson’ works with positive and negative values
            ‘box-cox’ only works with strictly positive values
        standardize: boolean
            Normalize to standard normal or not.
            Recommend using a separate `standard` function instead of using this option.
        lmd: list or 1-dim ndarray
            You can assign each input xs a specific lmd yourself.
            Leave None (default) to use an inferred value.
            See `PowerTransformer` for details.
        tolerance: tuple
            Tolerance of lmd. Set None to accept any.
            Default is **(-np.inf, np.inf)** but recommend **(-2, 2)** for Box-cox transform
        on_err: None or str
            Error handling when trying to infer lambda. Can be None, or one of the strings **log**, **nan** or **raise**.
            **log** will return the logarithmic transform of xs, shifted so that its minimum is 1.
            **nan** returns an ``ndarray`` with shape xs.shape filled with ``np.nan``.
            **raise** raises a FloatingPointError. You can catch it yourself.
            Default (None) returns the input series without the scale transform.
        .. _PowerTransformer:
            https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer
        """
        self._tolerance = tolerance
        self._pt = PT(method=method, standardize=standardize)
        self._lmd = lmd
        self._shape = None
        self._on_err = on_err

    def _check_type(self, x):
        if isinstance(x, list):
            x = np.array(x, dtype=np.float)
        elif isinstance(x, (DataFrame, Series)):
            x = x.values
        if not isinstance(x, np.ndarray):
            raise TypeError(
                'parameter `X` should be a `DataFrame`, `Series`, `ndarray` or list object '
                'but got {}'.format(type(x)))
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        return x

    def fit(self, x):
        """
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data used to compute the per-feature transformation

        Returns
        -------
        self : object
            Fitted scaler.
        """

        x = self._pt._check_input(self._check_type(x), in_fit=True)

        # forcing constant column vectors to have no transformation (lambda=1)
        idx = []
        for i, col in enumerate(x.T):
            if np.all(col == col[0]):
                idx.append(i)

        if self._lmd is not None:
            if isinstance(self._lmd, float):
                self._pt.lambdas_ = np.array([self._lmd] * x.shape[1])
            elif x.shape[1] != len(self._lmd):
                raise ValueError(
                    'shape[1] of parameter `X` should be {} but got {}'.format(
                        x.shape[1], len(self._lmd)))
            else:
                self._pt.lambdas_ = np.array(self._lmd)
        else:
            self._pt.fit(x)

        if len(idx) > 0:
            self._pt.lambdas_[idx] = 1.

        return self

    def transform(self, x):
        ret = self._pt.transform(self._check_type(x))
        if isinstance(x, pd.DataFrame):
            return pd.DataFrame(ret, index=x.index, columns=x.columns)
        return ret

    def inverse_transform(self, x):
        ret = self._pt.inverse_transform(self._check_type(x))
        if isinstance(x, pd.DataFrame):
            return pd.DataFrame(ret, index=x.index, columns=x.columns)
        return ret
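A hypothetical usage sketch for the wrapper above, where PowerTransformer refers to the class defined in this excerpt (not scikit-learn's) and the assumed imports listed above it are in scope.

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.random.lognormal(size=100),
                   'b': np.random.normal(size=100)})
pt = PowerTransformer(method='yeo-johnson')                  # the wrapper defined above
out = pt.fit(df).transform(df)                               # DataFrame in, DataFrame out
back = pt.inverse_transform(out)
print(np.allclose(back.values, df.values))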
Code example #11
def inference(
    month: str,
    regr,
    df_test: pd.core.frame.DataFrame,
    transform: bool,
    path_df: str = None,
    test: bool = True,
):
    """This function generates inference files

    Parameters
    ----------
    month : str
        Corresponding month of the test file.
    regr :
        Trained ML model for inference generation
    df_test : pd.core.frame.DataFrame
        Test dataset used for inference generation
    transform : bool
        Whether to apply box-cox or not
    path_df : str
        Path to save the inference files. Defaults to None.
    test : bool
        If test files contains actual Fuel load values or not. Defaults to True.
    """

    df_test_pred = df_test
    if (
            test
    ):  # If the inference files contain true labels, drop them from the dataframe used for prediction
        if transform:
            scaler = PowerTransformer(method="box-cox")
            scaler.fit_transform(np.array(df_test.actual_load).reshape(-1, 1))

        df_test_pred = df_test.drop(["actual_load"], axis=1)
    y_pred = regr.predict(df_test_pred)
    if test:
        if transform:
            y_pred_inv = scaler.inverse_transform(y_pred.reshape(-1,
                                                                 1)).ravel()
        else:
            y_pred_inv = y_pred

        # If predicted fuel load values are below zero, using min-max normalization to change the prediction to the range of actual fuel load values
        if y_pred_inv.min() < 0:
            range_fl_predicted = max(y_pred_inv) - min(
                y_pred_inv)  # range of predicted fuel load values
            if range_fl_predicted != 0:
                y_pred_inv = (
                    y_pred_inv - min(y_pred_inv)
                ) / range_fl_predicted  # normalize predicted fuel load values based on its range
            range_fl_actual = max(df_test.actual_load) - min(
                df_test.actual_load)
            if range_fl_actual != 0:
                y_pred_inv = y_pred_inv * range_fl_actual + min(
                    df_test.actual_load
                )  # normalize predicted fuel load values based on actual fuel load range

        # Storing inference file as pandas dataframe
        output_df = pd.DataFrame(
            data={
                "lat":
                df_test.latitude,
                "lon":
                df_test.longitude,
                "actual_load":
                df_test.actual_load,
                "predicted_load":
                y_pred_inv,
                "APE": (np.abs((df_test.actual_load - y_pred_inv) /
                               df_test.actual_load)) * 100,
            })
        mape = (np.mean(
            np.abs((df_test.actual_load - y_pred_inv) / df_test.actual_load)) *
                100)
        if path_df is not None:
            output_df.to_csv(path_df, index=False)
        return mape

    else:
        scaler_filename = SCALER_FILENAME
        scaler = load(scaler_filename)  # Loading sklearn transformation
        if transform:
            y_pred_inv = scaler.inverse_transform(y_pred.reshape(-1,
                                                                 1)).ravel()
        else:
            y_pred_inv = y_pred
        output_df = pd.DataFrame(
            data={
                "lat": df_test.latitude,
                "lon": df_test.longitude,
                "predicted_load": y_pred_inv,
            })
        if path_df is not None:
            output_df.to_csv(path_df, index=False)
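The else branch above restores a previously persisted scaler via load(); a small sketch of how such a transformer might be saved and restored with joblib (the filename here is hypothetical, not the module's SCALER_FILENAME).

import numpy as np
from joblib import dump, load
from sklearn.preprocessing import PowerTransformer

scaler = PowerTransformer(method="box-cox")
scaler.fit(np.random.rand(500, 1) + 0.1)                     # strictly positive stand-in for actual_load
dump(scaler, "target_power_transformer.joblib")              # hypothetical filename
scaler = load("target_power_transformer.joblib")             # same fitted object, ready for inverse_transform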
Code example #12
class linReg:
    def __init__(self, in_df):
        df = self.__imputeVals(in_df.copy())
        self.X = df.drop(columns=["SalePrice"]).copy()
        self.y = np.log(df.SalePrice.values.reshape(-1, 1))

        self._gridSearch = None
        self.pipeline_X = self.__make_pipe()
        #self.pipeline_y = StandardScaler()
        self.pipeline_y = PowerTransformer()
        self._searchSpace = None
        self._params = None
        self.lm = ElasticNet()

    def __imputeVals(self, in_df):
        return imputeVals(in_df)

    def __make_pipe(self):
        nonePipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value="None"), OneHotEncoder(drop="first"))
        zeroPipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value=0), OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value=0), PowerTransformer())

        regressionPipeline = ColumnTransformer([
            ("setNone", nonePipeline, fillNone),
            ("setZero", zeroPipeline, fillZeroCat),
            ("transformed", scalePipeline, fillZeroCont),
            ("dictImputed", make_pipeline(dictImputer(imputeDict),
                                          OneHotEncoder(drop="first")), list(imputeDict.keys())),
            ("bool", "passthrough", imputeBool),
            ("categoricalInts", "passthrough", cat_to_int),
            ("dropped", "drop", dropList)
        ], remainder="drop")
        return regressionPipeline

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        self._searchSpace = params
        #self._params = None

        piped_X = self.pipeline_X.fit_transform(self.X)
        piped_y = self.pipeline_y.fit_transform(self.y)
        self._gridSearch = GridSearchCV(
            self.lm, params, cv=cv, scoring="neg_mean_squared_error", n_jobs=njobs, verbose=verbose)
        self._gridSearch.fit(piped_X, piped_y)

    def getBestParams(self):
        if self._gridSearch is not None:
            return self._gridSearch.best_params_
        else:
            raise ValueError()

    def getBestScore(self):
        if self._gridSearch is not None:
            return self._gridSearch.best_score_
        else:
            raise ValueError()

    def fitModel(self, params):
        piped_X = self.pipeline_X.fit_transform(self.X)
        piped_y = self.pipeline_y.fit_transform(self.y)
        self._params = params

        self.lm.set_params(**params)
        self.lm.fit(piped_X, piped_y)

    def __invert(self, y):
        return np.exp(self.pipeline_y.inverse_transform(y))

    def getTrainScore(self):
        piped_X = self.pipeline_X.transform(self.X)
        piped_y = self.pipeline_y.transform(self.y)
        return self.lm.score(piped_X, piped_y)

    # Mean squared error on the log-scale target (no square root is taken here)
    def getRMSLE(self):
        piped_X = self.pipeline_X.transform(self.X)
        preds = self.lm.predict(piped_X).reshape(-1,1)
        preds = self.pipeline_y.inverse_transform(preds)
        return mean_squared_error(self.y,preds)

    def predict(self, test_X):
        piped_X = self.pipeline_X.transform(self.__imputeVals(test_X))
        preds = self.lm.predict(piped_X).reshape(-1,1)
        return self.__invert(preds)
Code example #13
File: DeepAR.py  Project: AlokD123/SportsCast
class DeepAR(Model):
    def __init__(self,
                 train_ds_all: ListDataset,
                 model=None,
                 transform: str = 'none',
                 predictor=None):
        '''
        Parameters
        ----
        train_ds_all: a special ListDataset instance for training, as defined in MXNet's GluonTS package. ListDataset contains player_dict dictionaries for each player, as defined in Model.py

        model: optional pre-existing UNTRAINED model.

        predictor: optional pre-existing TRAINED model.

        transform: transform being applied during pre-/post-processing for ALL ARIMA models. Specify as string. Currently supports 'yj' and 'log' transforms
        '''
        super().__init__()
        self.data_train = train_ds_all
        self.estimator = model
        self.predictor = predictor

        #Add hparams
        self.transform = transform if self.estimator is None else None
        self.power_transformer = PowerTransformer(
        ) if self.estimator is None else None

    def create(
        self,
        data_train,  #ListDataset containing training data + metadata    #NOTE: provides: 'feat_dynamic_cat', 'feat_dynamic_real', 'feat_static_cat', 'name','start','target'
        save_path,  #Save location
        use_exog_feat=False,  #Whether or not to use the exogenous features for modelling
        num_epochs=50,  #Number of epochs to train
        lr=1e-3,  #Learning rate
        batch_size=64,  #Batch size
        scaling=False,  #Boolean indicating whether to scale data or not
        context_length=3,  #Number of samples to roll out LSTM/GRU
        num_layers=3,  #Number of RNN layers
        embedding_dimension=16,  #Dimension of embeddings layer
        context='cpu',  #GPU/CPU training setting
        prediction_length=82,  #Forecast horizon
        cardinality=None,  #Number of values in each categorical feature (inferred if None)
        lags_seq=None,  #Indices of the lagged target values to use as inputs of the RNN
        dropout_rate=0.1,  #Dropout rate
        num_cells=40,  #Number of cells in model
        cell_type='lstm',  #Type (LSTM or GRU)
        num_parallel_samples=100
    ):  #Number of parallel predictions to sample from learnt distribution
        '''
        Creates a model for ALL the players in the training dataset

        Parameters
        ----
        As defined above

        Returns
        ----
        estimator: a DeepAREstimator instance ready to be trained
        '''

        self.data_train = data_train

        freq = data_train.list_data[0][
            'freq']  #Use metadata for arbitrary player to get frequency, since always same

        trainer = Trainer(batch_size=batch_size,
                          epochs=num_epochs,
                          learning_rate=lr,
                          ctx=context,
                          hybridize=False)
        estimator = DeepAREstimator(
            freq=freq,
            prediction_length=prediction_length,
            scaling=scaling,
            context_length=context_length,
            num_layers=num_layers,
            embedding_dimension=embedding_dimension,
            trainer=trainer,
            use_feat_dynamic_real=True if use_exog_feat else False,
            use_feat_static_cat=False,
            use_feat_static_real=False,
            cardinality=cardinality,
            lags_seq=lags_seq,
            dropout_rate=dropout_rate,
            num_cells=num_cells,
            cell_type=cell_type,
            num_parallel_samples=num_parallel_samples)

        self.estimator = estimator

        return estimator

    def preprocess(self, player_train_labels
                   ):  #self.power_transformer, transform, stand, scale
        '''
        Helper method to preprocess data for a SINGLE player

        Parameters
        ----
        player_train_labels: labels for a single player's training

        Returns
        ----
        preprocessed labels for training
        '''

        #By definition, only one col in df
        try:
            assert np.array(player_train_labels).shape[1] == 1
        except:
            logging.warn(f'Horizontal list?')
            assert np.array(player_train_labels).reshape(-1, 1).shape[0] == len(
                np.array(player_train_labels))
            player_train_labels = np.array(player_train_labels).reshape(-1, 1)

        if self.transform == 'log':
            # TODO: make this stat agnostic
            player_train_labels.iloc[:,
                                     0] = np.log(player_train_labels.iloc[:,
                                                                          0])
        elif self.transform == 'yj':
            transformer = self.power_transformer
            transformer.fit(player_train_labels.iloc[:,
                                                     0].values.reshape(-1, 1))
            player_train_labels.iloc[:, 0] = transformer.transform(
                player_train_labels.iloc[:, 0].values.reshape(-1, 1))

        return player_train_labels

    def fit(self):
        '''
        Parameters
        -----
        None

        Returns
        -----
        predictor: trained model
        '''
        self.predictor = self.estimator.train(self.data_train)

    def predict(self, num_per=None, return_conf_int=True):
        '''
        Parameters
        -----
        num_per: unused, since constant
        return_conf_int: unused, since always True

        Returns
        -----
        pred_generator: predictions generator
        '''
        pred_generator = self.predictor.predict(self.data_train)
        return pred_generator
        #TODO: add boolPredictInsample option

    def postprocess(self, targets=None, predictions=None, intervals=None):
        #TODO: clean up
        #TODO: postprocess for scale/stand
        '''
        Helper method to postprocess data for a SINGLE player

        Parameters
        ----
        targets: labels for a single player's training

        intervals: see above

        predictions: see above

        Returns
        ----
        post-processed versions of each of the above
        '''

        for val in [targets, predictions]:
            if val is not None:

                #Reshape prediction vectors
                val = np.array(val).reshape(-1, 1)
                if len(np.array(val).shape) > 2:
                    val = np.array(val)[0]

                #Transform
                if self.transform == 'log':
                    val = np.exp(val)
                elif self.transform == 'yj':
                    val = self.power_transformer.inverse_transform(
                        val.reshape(-1, 1))

        if intervals is not None:
            #Reshape array of prediction confidence intervals
            intervals = np.array(intervals).reshape(-1, 2)
            if len(np.array(intervals).shape) > 3:
                intervals = np.array(intervals)[0]

            #Transform and decompose
            if self.transform == 'yj':
                low_intervals = self.power_transformer.inverse_transform(
                    intervals[:, 0].reshape(-1, 1))
                high_intervals = self.power_transformer.inverse_transform(
                    intervals[:, 1].reshape(-1, 1))
            else:
                if self.transform == 'log':
                    intervals = np.exp(intervals)
                else:
                    pass
                #Decompose into lower and upper bounds
                low_intervals = []
                high_intervals = []
                for low, high in intervals:
                    low_intervals.append(low)
                    high_intervals.append(high)

            return targets, predictions, low_intervals, high_intervals

        return targets, predictions, intervals

    def process_prediction(self, prediction):
        ''' Processes predictions for all players '''
        mean = prediction.mean_ts
        mean = mean.reset_index()
        mean = mean.rename(columns={0: 'predictions'})
        mean = mean.rename(columns={'index': 'date'})
        mean = mean.drop(columns=['date'])
        mean['gameNumber'] = mean.index + 1
        conf = pd.DataFrame()
        conf.loc[:, 'low'] = prediction.quantile('0.05')
        conf.loc[:, 'high'] = prediction.quantile('0.95')
        full_df = pd.concat([mean, conf], axis=1)
        return full_df

    def generate_prediction_df(self,
                               predictions,
                               data,
                               drop=True,
                               target='cumStatpoints',
                               scaled=None,
                               scaling_loc=None):
        ''' Postprocess predictions for ALL players and return as df '''
        if scaled is not None:
            scaling_meta = pd.read_pickle(scaling_loc)
            print(scaling_meta)
        names = data.loc[:, 'name'].unique()
        full_predictions = pd.DataFrame()
        for prediction, name in zip(
                predictions, names
        ):  #ONE FORECAST OF LENGTH prediction_length PER PLAYER, in order of data['name']
            player_df = pd.DataFrame()
            player_data = data.loc[data.loc[:, 'name'] == name].loc[:, [
                'name', 'gameNumber', target
            ]]  #DF OF 'name', 'date', 'cumStatpoints' for ONE PLAYER

            data_length = player_data.shape[0]
            prediction_df = self.process_prediction(prediction)
            if drop:
                prediction_df = prediction_df.iloc[:
                                                   data_length, :]  #Drop excess predictions if no data available for evaluation
            player_data.reset_index(drop=True, inplace=True)
            prediction_df.reset_index(drop=True, inplace=True)
            if scaled == 'ss':
                scale_data = scaling_meta.loc[scaling_meta.loc[:,
                                                               'name'] == name]
                for column in ['predictions', 'low', 'high']:
                    prediction_df.loc[:, column] = ((prediction_df.loc[:, column] * scale_data['maxabs']) \
                                                * scale_data['std']) + scale_data['mean']
            elif scaled == 'unit':
                scale_data = scaling_meta.loc[scaling_meta.loc[:,
                                                               'name'] == name]
                for column in ['predictions', 'low', 'high']:
                    prediction_df.loc[:, column] = (
                        prediction_df.loc[:, column] -
                        scale_data['min'].values) / scale_data['scale'].values

            player_data_df = pd.concat([player_data, prediction_df], axis=1)
            full_predictions = pd.concat([full_predictions, player_data_df])
        return full_predictions

    #NOTE: Not possible to implement at this point. See presentation for details
    def update(self, new_data_ds):
        pass

    #NOTE: not implemented because update is not possible
    def evaluate(self, test_ds_all: ListDataset, horizon: int = 0):
        pass
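A standalone sketch of the per-player 'yj' round trip implemented by preprocess()/postprocess() above, with a synthetic cumulative-points series standing in for real labels.

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

player_train_labels = pd.DataFrame(
    {'cumStatpoints': np.cumsum(np.random.poisson(1.5, 60)).astype(float)})  # synthetic labels

transformer = PowerTransformer()                              # Yeo-Johnson by default
transformer.fit(player_train_labels.iloc[:, 0].values.reshape(-1, 1))
transformed = transformer.transform(player_train_labels.iloc[:, 0].values.reshape(-1, 1))

# After forecasting in the transformed space, predictions are mapped back:
predictions = transformed[-10:]                               # stand-in for model output
predictions = transformer.inverse_transform(predictions.reshape(-1, 1))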
Code example #14
class PreprocessData:
    def __init__(self,
                 preprocess_type=None,
                 extend_data=False,
                 short_end=False):

        self.config = Config()
        # prepare input data
        config_path = self.config.get_filepath("", "config.yaml")

        config_file = open(config_path, 'r')
        yaml_config = yaml.load(config_file, Loader=yaml.SafeLoader)

        self.training_dataset_names = [
            d['name'] for d in yaml_config['training_datasets']
        ]
        self.training_dataset_start_pos = [
            d['start_position'] for d in yaml_config['training_datasets']
        ]
        self.test_dataset_names = [
            d['name'] for d in yaml_config['test_datasets']
        ]
        self.test_dataset_start_pos = [
            d['start_position'] for d in yaml_config['test_datasets']
        ]
        self.dataset_names = np.concatenate(
            (self.training_dataset_names,
             self.test_dataset_names))  # do we need these?
        self.dataset_start_pos = np.concatenate(
            (self.training_dataset_start_pos,
             self.test_dataset_start_pos))  # do we need these?

        # read in all pickle files
        self.all_pd = []
        for dataset_name in self.dataset_names:
            self.all_pd.append(
                pd.read_pickle(self.config.get_filepath_data(dataset_name)))

        if extend_data:
            training_dataset_names_copy = np.array(self.training_dataset_names,
                                                   copy=True)

            # create a copy of the data shifted up by 10
            for i, dataset_name in enumerate(training_dataset_names_copy):
                self.dataset_names = np.append(self.dataset_names,
                                               dataset_name + "_" + str(10))
                self.training_dataset_names = np.append(
                    self.training_dataset_names, dataset_name + "_" + str(10))
                self.dataset_start_pos = np.append(
                    self.dataset_start_pos, self.training_dataset_start_pos[i])
                self.training_dataset_start_pos.append(
                    self.training_dataset_start_pos[i])
                self.all_pd.append(self.all_pd[i].copy() + 10)

        self.dict_datasets = dict(
            zip(self.dataset_names, np.arange(len(self.dataset_names))))

        self.enable_difference = False

        self._feature_range = [0, 1]
        self.normalisation_scalers = []
        for _ in self.dataset_names:
            self.normalisation_scalers.append(
                MinMaxScaler(feature_range=self.feature_range))

        self.enable_normalisation_scaler = False
        self.enable_ignore_price = False  # scale each curve to feature_range

        self.power_transformer = PowerTransformer()
        self.enable_power_transform = False

        self.standardisation_scalers = []
        for _ in self.dataset_names:
            self.standardisation_scalers.append(StandardScaler())

        self.enable_standardisation_scaler = False

        self.enable_log_returns = False
        self.mult_factor = 10  # 5
        self.add_factor = 25  # 6

        self.enable_log = False
        self.enable_pct_change = False

        self.enable_curve_smoothing = False

        self.short_end = short_end

        # now setup PreprocessType settings
        if preprocess_type is PreprocessType.NORMALISATION_OVER_TENORS:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
        elif preprocess_type is PreprocessType.NORMALISATION_OVER_CURVES:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
            self.enable_ignore_price = True
        elif preprocess_type is PreprocessType.STANDARDISATION_OVER_TENORS:
            self.enable_standardisation_scaler = True
        elif preprocess_type is PreprocessType.LOG_RETURNS_OVER_TENORS:
            self.enable_log_returns = True

    @property
    def feature_range(self):  # implements the get - this name is *the* name
        return self._feature_range

    @feature_range.setter
    def feature_range(self, value):  # name must be the same
        self._feature_range = value
        for i, _ in enumerate(self.dataset_names):
            self.normalisation_scalers[i] = MinMaxScaler(feature_range=value)

    def get_data(self,
                 training_dataset_names=None,
                 test_dataset_names=None,
                 chunks_of=None):

        if training_dataset_names is None:
            training_dataset_names = self.training_dataset_names
        if isinstance(training_dataset_names, str):
            training_dataset_names = np.array([training_dataset_names])

        if test_dataset_names is None:
            test_dataset_names = self.test_dataset_names
        if test_dataset_names is None and self.test_dataset_names is None:
            test_dataset_names = []

        if isinstance(test_dataset_names, str):
            test_dataset_names = np.array([test_dataset_names])

        training_data = []
        test_data = []
        training_data_scaled = []
        test_data_scaled = []
        for key, value in self.dict_datasets.items():
            start_position = self.dataset_start_pos[value]
            end_position = None
            if chunks_of is not None:
                end_position = chunks_of * (
                    (self.all_pd[value].shape[0] - start_position) //
                    chunks_of)

            if key in training_dataset_names:
                # we take the log returns of each data set and scale wrt first dataset
                new_training_data = self.all_pd[value].copy(
                )[start_position:end_position]
                if self.short_end:
                    new_training_data = new_training_data.iloc[:, 0]

                new_training_data_scaled = self.scale_data(
                    new_training_data, value, True)

                training_data.append(new_training_data)
                training_data_scaled.append(new_training_data_scaled)

            if key in test_dataset_names:
                new_test_data = self.all_pd[value].copy(
                )[start_position:end_position]
                if self.short_end:
                    new_test_data = new_test_data.iloc[:, 0]

                new_test_data_scaled = self.scale_data(
                    new_test_data, value,
                    True)  # todo: should we scale test data wrt training data?

                test_data.append(new_test_data)
                test_data_scaled.append(new_test_data_scaled)

        maturities = self.all_pd[0].columns.values / (30 * 12)  # for years

        if test_dataset_names is not None:
            return training_data, test_data, training_data_scaled, test_data_scaled, training_dataset_names, test_dataset_names, maturities
        else:
            return training_data_scaled, maturities

    # def rescale_data_inputter(self, data, datasets=None):
    #     rescaled_data = []
    #     if datasets == "train":
    #         for i, name in enumerate(self.training_dataset_names):
    #             # pos = self.dict_datasets[name]
    #             rescaled_data.append(self.rescale_data(data[i], dataset_name=name))
    #
    #     elif datasets == "test":
    #         for i, name in enumerate(self.test_dataset_names):
    #             # pos = self.dict_datasets[name]
    #             # self.scale_data(self, data, dataset_num=pos)
    #             rescaled_data.append(self.rescale_data(data[i], dataset_name=name))
    #
    #     return rescaled_data

    def scale_data(self, data, dataset_name=None, should_fit=False):

        # if given a numpy array, convert it to a dataframe first
        if type(data) is np.ndarray:
            _data = pd.DataFrame(data=data)
        elif isinstance(data, list):
            _data_list = []
            # if isinstance(dataset_name, list):
            for _data, _dataset_name in zip(data, dataset_name):
                _data_list.append(
                    self.scale_data(_data, _dataset_name, should_fit))
            # else:
            #     for _data in data:
            #         _data_list.append(self.scale_data(_data, should_fit, dataset_name))
            return _data_list
        else:
            _data = data.copy()

        time = _data.axes[0].tolist()
        # maturities = _data.columns.values

        dataset_num = 999
        if dataset_name is not None:
            if isinstance(dataset_name, numbers.Integral):
                dataset_num = dataset_name
            else:
                for key, value in self.dict_datasets.items():
                    if key == dataset_name:
                        dataset_num = value

        if self.enable_log:
            _data = _data.apply(np.log)

        if self.enable_difference:
            _data = _data.diff(axis=1)
            _data = _data.fillna(0)

        if self.enable_pct_change:
            _data = _data.pct_change()
            _data = _data.fillna(0)

        if self.enable_log_returns:
            shift = (_data.shift(0) + self.add_factor) / (
                _data.shift(1) + self.add_factor
            )  # add add_factor to keep the ratio positive, so the log can be taken later
            shift = shift.dropna()

            if not (np.array(shift) > 0).all():
                # some values are non-positive... this will break the log
                print("NON-POSITIVE VALUES FOUND, CANNOT PASS THROUGH LOG!!")
                print(np.min(_data))
                print(shift)

            _data = self.mult_factor * np.log(shift)

            time = _data.axes[0].tolist()

        # now use only numpy, convert pandas to numpy array
        _data = _data.values

        if self.short_end and len(_data.shape) == 1:
            _data = _data.reshape(-1, 1)

        if self.enable_standardisation_scaler:
            if not self.enable_ignore_price:
                if should_fit:
                    self.standardisation_scalers[dataset_num].fit(_data)
                _data = self.standardisation_scalers[dataset_num].transform(
                    _data)
            else:
                data_temp = []
                for row in _data:
                    # row_as_2d = row.reshape(1, -1)
                    row_as_column = row[:, np.newaxis]
                    self.standardisation_scalers[dataset_num].fit(
                        row_as_column)
                    temp = self.standardisation_scalers[dataset_num].transform(
                        row_as_column)
                    data_temp.append(temp.ravel())
                _data = np.array(data_temp)

        if self.enable_normalisation_scaler:
            if not self.enable_ignore_price:
                if should_fit:
                    self.normalisation_scalers[dataset_num].fit(_data)
                _data = self.normalisation_scalers[dataset_num].transform(
                    _data)
            else:
                data_temp = []
                for row in _data:
                    # row_as_2d = row.reshape(1, -1)
                    row_as_column = row[:, np.newaxis]
                    self.normalisation_scalers[dataset_num].fit(row_as_column)
                    temp = self.normalisation_scalers[dataset_num].transform(
                        row_as_column)
                    data_temp.append(temp.ravel())
                _data = np.array(data_temp)

        if self.enable_power_transform:
            if should_fit:
                self.power_transformer.fit(_data)
            _data = self.power_transformer.transform(_data)

        df = pd.DataFrame(data=_data, index=np.array(time))

        return df

    def rescale_data(self,
                     data,
                     dataset_name=None,
                     start_value=None,
                     index=None,
                     columns=None):

        if isinstance(data, pd.DataFrame):
            if columns is None:
                columns = data.columns.values
            if index is None:
                index = data.index.values

        if type(data) is np.ndarray:
            temp_data = data
        else:
            temp_data = np.array(data)

        if self.short_end and len(temp_data.shape) == 1:
            temp_data = temp_data.reshape(-1, 1)

        dataset_num = 999
        if dataset_name is not None:
            for key, value in self.dict_datasets.items():
                if key == dataset_name:
                    dataset_num = value

        if self.enable_difference:
            temp_data = temp_data  # TODO: inverse difference

        if self.enable_power_transform:
            temp_data = self.power_transformer.inverse_transform(temp_data)

        if self.enable_normalisation_scaler:

            # we need to scale each rolling window manually
            if self.enable_ignore_price:
                # rescale each curve individually
                data_min = self.all_pd[dataset_num].min(axis=1)
                data_max = self.all_pd[dataset_num].max(axis=1)
                a = self.feature_range[0]
                b = self.feature_range[1]
                for i in np.arange(temp_data.shape[0]):
                    temp_data[i] = (
                        (temp_data[i] - a) /
                        (b - a)) * (data_max[i] - data_min[i]) + data_min[i]
            else:
                if len(temp_data.shape) == 3:
                    new_temp_data = []
                    for i in np.arange(temp_data.shape[0]):
                        new_temp_data.append(
                            self.normalisation_scalers[dataset_num].
                            inverse_transform(temp_data[i]))
                    temp_data = np.array(new_temp_data)

                else:
                    temp_data = self.normalisation_scalers[
                        dataset_num].inverse_transform(temp_data)

        if self.enable_standardisation_scaler:
            # temp_data = self.standardisation_scaler.inverse_transform(temp_data)
            if self.enable_ignore_price:
                raise NotImplementedError
            else:
                if len(temp_data.shape) == 3:
                    new_temp_data = []
                    for i in np.arange(temp_data.shape[0]):
                        new_temp_data.append(
                            self.standardisation_scalers[dataset_num].
                            inverse_transform(temp_data[i]))
                    temp_data = np.array(new_temp_data)

                else:
                    temp_data = self.standardisation_scalers[
                        dataset_num].inverse_transform(temp_data)

        if self.enable_log:
            temp_data = np.exp(temp_data)

        if self.enable_log_returns:

            # if start_value is not assigned but dataset_name is, use the first value of the dataset as start_value
            if dataset_name is not None and start_value is None:
                _start_value = self.all_pd[dataset_num].iloc[0]
            elif start_value is not None:
                _start_value = start_value
            else:
                _start_value = 1.

            # print("shapes, log-return rescale", temp_data.shape, _start_value.shape, _start_value[0].shape)

            if len(temp_data.shape) == 1:
                z = np.exp(temp_data / self.mult_factor)

                z = np.insert(
                    np.array(z), 0, _start_value[0] +
                    self.add_factor)  # instead of the usual _start_value
                temp_data = np.cumprod(z) - self.add_factor
                temp_data = pd.DataFrame(data=temp_data,
                                         index=self.all_pd[dataset_num].index)
                # print(temp_data.head(10))
            elif len(temp_data.shape) == 2:  # log-returns on an individual batch, todo: check

                if self.short_end:
                    z = np.exp(temp_data / self.mult_factor)
                    z = np.insert(z,
                                  0,
                                  _start_value[0] + self.add_factor,
                                  axis=0)
                    temp_data = np.cumprod(z, axis=0) - self.add_factor
                else:
                    z = np.exp(temp_data / self.mult_factor)
                    z = np.insert(z, 0, _start_value + self.add_factor, axis=0)
                    temp_data = np.cumprod(z, axis=0) - self.add_factor

            elif len(temp_data.shape
                     ) > 2:  # when taking log-returns on multiple batches
                z = np.exp(temp_data[:, :] / self.mult_factor)
                z = np.insert(z, 0, _start_value + self.add_factor, axis=1)
                temp_data = np.cumprod(z, axis=1) - self.add_factor
            else:
                z = np.exp(temp_data[0, :] / self.mult_factor)
                z = np.insert(z, 0, _start_value + self.add_factor)
                temp_data = np.cumprod(z) - self.add_factor

            # print("log returns undo...", _start_value, temp_data[0])

        if self.enable_curve_smoothing:
            curve_smooth = []

            for curve in temp_data:
                curve_smooth.append(savgol_filter(
                    curve, 23, 5))  # window size 23, polynomial order 5
            temp_data = np.array(curve_smooth)

        if index is not None and columns is not None:
            return pd.DataFrame(temp_data, index=index, columns=columns)
        else:
            return temp_data
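A minimal round-trip sketch of the log-return inversion performed above, using a toy price series; mult_factor, add_factor and the prices are assumed values, not taken from the original class configuration.

# standalone check of the cumprod-based log-return reconstruction (assumed values)
import numpy as np

mult_factor = 1.0
add_factor = 0.0
prices = np.array([100.0, 101.5, 99.8, 102.3])

# forward step assumed by the method: scaled log-returns of the shifted series
log_returns = mult_factor * np.diff(np.log(prices + add_factor))

# inverse step mirroring the branches above: exp, prepend the start value, cumprod
z = np.exp(log_returns / mult_factor)
z = np.insert(z, 0, prices[0] + add_factor)
reconstructed = np.cumprod(z) - add_factor

assert np.allclose(reconstructed, prices)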
                            random_state=randomstate)
clf.fit(X_train, y_train)
# save classifier for further use
dump(clf, clfpath)
print("Training complete...")
# clf = load(clfpath)

# VALIDATION SET
# load validation data
validationfeatures = pd.read_csv(
    "/media/yannick/c4a7e8d3-9ac5-463f-b6e6-92e216ae6ac0/BRATS/BraTS2020/validationfeat_normalized.csv",
    index_col="ID")

y_pred_validation_tmp = clf.predict(validationfeatures)
y_pred_validation = np.squeeze(
    ptfm.inverse_transform(y_pred_validation_tmp.reshape(-1, 1)))
pred_validation_df = pd.DataFrame(data=zip(validationfeatures.index.values,
                                           y_pred_validation),
                                  columns=["ID", "Prediction"])
pred_validation_df.to_csv(os.path.join(
    outpath, "validationprediction_powertfm_FINAL.csv"),
                          header=False,
                          index=False)

# TESTING SET
# load test data
testfeatures = pd.read_csv(
    "/media/yannick/c4a7e8d3-9ac5-463f-b6e6-92e216ae6ac0/BRATS/BraTS2020/testingfeat_normalized_NEW.csv",
    index_col="BraTS20ID")

y_pred_test_tmp = clf.predict(testfeatures)
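The example stops before the test predictions are post-processed; a minimal sketch of the likely continuation, mirroring the validation block above (the output filename is an assumption).

# hypothetical continuation: inverse power transform and export, as for validation
y_pred_test = np.squeeze(
    ptfm.inverse_transform(y_pred_test_tmp.reshape(-1, 1)))
pred_test_df = pd.DataFrame(data=zip(testfeatures.index.values, y_pred_test),
                            columns=["ID", "Prediction"])
pred_test_df.to_csv(os.path.join(outpath,
                                 "testprediction_powertfm_FINAL.csv"),
                    header=False,
                    index=False)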
コード例 #16
0
def create_predictions_df(df, kmeans, knn):

    # create 2018 & 2019 masked DataFrame
    predictions = df[(df.year == 2018) | (df.year == 2019)]
    # create DataFrame for total units over 2018-2019
    total_units = pd.DataFrame(predictions.groupby(["city", "state"]).total_high_density_units.sum())
    # create DataFrame for total buildings over 2018-2019
    total_bldgs = pd.DataFrame(predictions.groupby(["city", "state"]).total_high_density_bldgs.sum())
    # create DataFrame for total value over 2018-2019
    total_value = pd.DataFrame(predictions.groupby(["city", "state"]).total_high_density_value.sum())
    # merging total_units to predictions
    predictions = predictions.merge(total_units, how="left", on=["city", "state"], suffixes=("_og", "_1819"))
    # merging total_bldgs to predictions
    predictions = predictions.merge(total_bldgs, how="left", on=["city", "state"], suffixes=("_og", "_1819"))
    # merging total_values to predictions
    predictions = predictions.merge(total_value, how="left", on=["city", "state"], suffixes=("_og", "_1819"))

    # 2018-2019 total units and buildings needed to calculate the proper weighted average
    predictions = predictions.groupby("city_state")[["total_high_density_units_1819", "total_high_density_bldgs_1819"]].mean()

    # masking initial df variable for last two years 
    # grouping by city_state to get 130 unique observations
    # calc mean for ei, total buildings, and total valuation
    avgs = df[(df.year == 2018) | (df.year == 2019)].groupby("city_state")[["ei_x", "total_high_density_bldgs", "total_high_density_value"]].mean()
    
    # predictions["avg_units_per_bldg"] = predictions["total_high_density_units_1819"] / predictions["total_high_density_bldgs_1819"]
    # predictions.drop(columns="total_high_density_units_1819", inplace=True)

    # calc weighted average number of units per building over 2018-2019
    avgs["avg_units_per_bldg"] = predictions["total_high_density_units_1819"] / predictions["total_high_density_bldgs_1819"]

    # create object
    scaler = PowerTransformer()
    # fit object
    scaler.fit(avgs[["avg_units_per_bldg", "ei_x"]])
    # transform using object
    avgs[["avg_units_per_bldg", "ei_x"]] = scaler.transform(avgs[["avg_units_per_bldg", "ei_x"]])
    
    # define features for KMeans modeling
    X = avgs[["avg_units_per_bldg", "ei_x"]]

    avgs["cluster"] = kmeans.predict(X)

    avgs[["avg_units_per_bldg", "ei_x"]] = scaler.inverse_transform(avgs[["avg_units_per_bldg", "ei_x"]])
    
    scaler, avgs_scaled = min_max_scaler_prediction(avgs)

    avgs["label"] = knn.predict(avgs_scaled)

    city = avgs.reset_index().city_state.str.split("_", n=1, expand=True)[0]

    state = avgs.reset_index().city_state.str.split("_", n=1, expand=True)[1]

    avgs = avgs.reset_index()

    avgs["city"] = city

    avgs["state"] = state


    # take explicit copies so the recommendation_label column below is set on
    # independent DataFrames rather than on slices of avgs
    df_best = (
        avgs[(avgs.label) & ((avgs.cluster == 0) | (avgs.cluster == 4))].copy()
    )

    df_high_density = (
        avgs[(avgs.label) & ((avgs.cluster == 5) | (avgs.cluster == 2))].copy()
    )

    df_stable_high_markets = (
        avgs[(avgs.label) & ((avgs.cluster == 3) | (avgs.cluster == 1))].copy()
    )

    df_best["recommendation_label"] = "Best_ROI"

    df_high_density["recommendation_label"] = "medium_ROI"

    df_stable_high_markets["recommendation_label"] = "Stable_High"

    avgs["recommendation_label"] = np.nan

    avgs.recommendation_label = avgs.recommendation_label.fillna(df_best.recommendation_label)

    avgs.recommendation_label = avgs.recommendation_label.fillna(df_high_density.recommendation_label)

    avgs.recommendation_label = avgs.recommendation_label.fillna(df_stable_high_markets.recommendation_label)

    avgs.recommendation_label = avgs.recommendation_label.fillna("Not Recommended to Enter")

    return avgs
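A minimal usage sketch of create_predictions_df, assuming the permits DataFrame df and the kmeans and knn models were prepared and fitted elsewhere (all variable names are assumptions).

# hypothetical call with pre-fitted models
avgs = create_predictions_df(df, kmeans, knn)

# inspect the recommended markets
recommended = avgs[avgs.recommendation_label != "Not Recommended to Enter"]
print(recommended[["city", "state", "cluster", "recommendation_label"]].head())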
コード例 #17
0
class DFPowerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = PowerTransformer(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        # Reference: https://help.gooddata.com/doc/en/reporting-and-dashboards/maql-analytical-query-language/maql-expression-reference/aggregation-functions/statistical-functions/predictive-statistical-use-cases/normality-testing-skewness-and-kurtosis
        # Highly skewed:            Skewness < -1 or Skewness > 1
        # Moderately skewed:        -1 < Skewness < -0.5 or 0.5 < Skewness < 1
        # Approximately symmetric:  -0.5 < Skewness < 0.5
        skew_df = X[self.transform_cols].skew().to_frame(name='Skewness')
        # pandas .kurt() reports excess kurtosis: ~0 for a normal distribution
        kurt_df = X[self.transform_cols].kurt().to_frame(name='Kurtosis')
        self.stat_df = skew_df.merge(kurt_df,
                                     left_index=True,
                                     right_index=True,
                                     how='left')

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(
            X[self.transform_cols])

        # Transformed skewness & kurtosis
        skew_df = new_X[self.transform_cols].skew().to_frame(
            name='Skewness (Transformed)')
        kurt_df = new_X[self.transform_cols].kurt().to_frame(
            name='Kurtosis (Transformed)')
        stat_df = skew_df.merge(kurt_df,
                                left_index=True,
                                right_index=True,
                                how='left')
        self.stat_df = self.stat_df.merge(stat_df,
                                          left_index=True,
                                          right_index=True,
                                          how='left')

        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(
            X[self.transform_cols])

        return new_X
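A minimal usage sketch of DFPowerTransformer on a synthetic DataFrame, showing the before/after skewness and kurtosis summary collected in stat_df; the column names and data are assumptions.

# toy data: one right-skewed column to transform, one left untouched
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy_df = pd.DataFrame({
    'income': rng.lognormal(mean=3.0, sigma=0.8, size=500),
    'age': rng.normal(loc=40, scale=10, size=500),
})

tfm = DFPowerTransformer(columns=['income'])
toy_tfm = tfm.fit_transform(toy_df)

# skewness / kurtosis before and after the power transform (Yeo-Johnson by default)
print(tfm.stat_df)

# round trip back to the original scale
toy_back = tfm.inverse_transform(toy_tfm)
print(np.allclose(toy_back['income'], toy_df['income']))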
print(cross_val_score(elastic, X_train_sc, y_train_pt[:,0], cv = 5).mean())

# model fitting and evaluation:
ridge.fit(X_train_sc, y_train_pt);
print('ridge score on training set:', ridge.score(X_train_sc, y_train_pt))
print('ridge score on test set: ', ridge.score(X_test_sc, y_test_pt))

# predicting:
ridge_pred = ridge.predict(X_test_sc)

plt.hist(ridge_pred);
plt.title('ridge predictions, based on log-transformation'.title());

# to go back to originals:
# .reshape(-1, 1) turns the 1-D prediction array into a 2-D array with a single
# column, which is the shape the transformer's inverse_transform expects
ridge_pred_reversed = pt_y.inverse_transform(ridge_pred.reshape(-1,1))

plt.hist(ridge_pred_reversed);
plt.title('ridge predictions, back to original values'.title());

print('ridge score on target: ', r2_score(y_test, ridge_pred_reversed))

resid = y_test_pt - ridge_pred
plt.hist(resid);
plt.title('errors distribution of ridge prediction'.title());


test_data_sc = ss.transform(test_data)
saleprice = ridge.predict(test_data_sc)
plt.hist(saleprice); #after rescaling
plt.title('sale prices after log transformation'.title());
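If the final sale-price predictions also need to go back to the original dollar scale, the same inverse transform applied to ridge_pred above would presumably be reused; a minimal sketch, assuming pt_y is the transformer fitted on the training target.

# hypothetical back-transform of the test-set predictions
saleprice_original = pt_y.inverse_transform(saleprice.reshape(-1, 1))

plt.hist(saleprice_original);
plt.title('sale prices back on the original scale'.title());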