import numpy as np
from sklearn.preprocessing import PowerTransformer
from statsmodels.tsa.seasonal import seasonal_decompose
# Moving-block bootstrap; assumed to be arch's implementation, which matches
# the data[0][0] access pattern used below.
from arch.bootstrap import MovingBlockBootstrap as MBB


def augmentation(X, Y, noise=False, bootstrapping=True,
                 noiseSTD=[0.1/2, 0.1/2, 0.01/2, 0.0002/2, 0.01/2, 0.02/2],
                 nr_boot=1000, bootstrap_bl_size=488, boot_freq=100):
    if noise:
        # Jitter every entry with Gaussian noise scaled per feature, then
        # stack the noisy copy under the original data.
        Xn = X.copy()
        for i, j, k in np.ndindex(X.shape):
            Xn[i, j, k] += np.random.normal(0, 1) * noiseSTD[k]
        X = np.vstack([X, Xn])
        Y = np.vstack([Y, Y])
    if bootstrapping:
        Xb = X.copy()
        pt = PowerTransformer(method='yeo-johnson', standardize=True)
        for i in range(Xb.shape[0]):
            pt.fit(Xb[i])
            lambda_param = pt.lambdas_
            transformed = pt.transform(Xb[i])
            # NOTE: `freq` was renamed `period` in newer statsmodels releases.
            result = seasonal_decompose(transformed, model='additive', freq=boot_freq)
            # Moving Block Bootstrap on the residuals: the trend and seasonal
            # components are kept fixed and the series is rebuilt from the
            # bootstrapped residual draw (only the final replicate is kept).
            bootstrapRes = MBB(bootstrap_bl_size, result.resid)
            for data in bootstrapRes.bootstrap(nr_boot):
                bs_x = data[0][0]
                reconSeriesYC = result.trend + result.seasonal + bs_x
                Xb[i] = pt.inverse_transform(reconSeriesYC)
        # seasonal_decompose leaves NaNs at the edges of the trend/residual
        # components; fall back to the original values there.
        for i, j, k in np.ndindex(X.shape):
            if np.isnan(Xb[i, j, k]):
                Xb[i, j, k] = X[i, j, k]
        X = np.vstack([X, Xb])
        Y = np.vstack([Y, Y])
    return X, Y
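# A minimal usage sketch for augmentation() on synthetic data. Assumptions:
# X has 6 features to match the default noiseSTD list, and bootstrapping is
# switched off so the sketch does not depend on the MBB import above.
rng = np.random.default_rng(0)
X_toy = rng.normal(size=(4, 200, 6))     # (samples, timesteps, features)
Y_toy = rng.normal(size=(4, 1))
X_aug, Y_aug = augmentation(X_toy, Y_toy, noise=True, bootstrapping=False)
print(X_aug.shape, Y_aug.shape)          # (8, 200, 6) (8, 1): both doubled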
def infer(self):
    train_pred = self.model.predict(self.X_train)
    val_pred = self.model.predict(self.X_val)
    test_pred = self.model.predict(self.X_test)
    print("-----------------------------------------------------------------")
    print("Training results", "\n")
    if self.transform is not None:
        # Re-fit the Box-Cox transformer on the training target so the
        # predictions can be mapped back to the original scale.
        scaler = PowerTransformer(method="box-cox")
        scaler.fit(np.array(self.train.actual_load).reshape(-1, 1))
        inv_train_pred = scaler.inverse_transform(np.array(train_pred).reshape(-1, 1))
        inv_val_pred = scaler.inverse_transform(np.array(val_pred).reshape(-1, 1))
        inv_test_pred = scaler.inverse_transform(np.array(test_pred).reshape(-1, 1))
        # mse(..., squared=False) reports the RMSE.
        print("Training error: ", mse(self.train.actual_load, inv_train_pred, squared=False))
        print("Validation error: ", mse(self.val.actual_load, inv_val_pred, squared=False))
        print("Test error: ", mse(self.y_test, inv_test_pred, squared=False))
        print("Note: the errors above are computed after inverting the Box-Cox transform.")
    else:
        print("Training error: ", mse(self.y_train, train_pred, squared=False))
        print("Validation error: ", mse(self.y_val, val_pred, squared=False))
        print("Test error: ", mse(self.y_test, test_pred, squared=False))
import typing as t

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer


class TargetPreprocessor(BaseEstimator, TransformerMixin):
    """Stabilizes the variance of the target."""

    def __init__(self):
        self.preprocessor = PowerTransformer()

    def fit(self, X: pd.Series, y: t.Optional[pd.Series] = None) -> 'TargetPreprocessor':
        self.preprocessor.fit(X.values.reshape(-1, 1), y)
        return self

    def transform(self, X: pd.Series) -> pd.Series:
        return pd.Series(data=self.preprocessor.transform(X.values.reshape(-1, 1)).flatten(),
                         name='loss', index=X.index)

    def inverse_transform(self, X: pd.Series) -> pd.Series:
        return pd.Series(data=self.preprocessor.inverse_transform(X.values.reshape(-1, 1)).flatten(),
                         name='loss', index=X.index)
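# A short usage sketch for TargetPreprocessor on a synthetic skewed target;
# the round trip through transform/inverse_transform recovers the input.
import numpy as np

y = pd.Series(np.random.default_rng(1).lognormal(size=100), name='loss')
prep = TargetPreprocessor().fit(y)
y_t = prep.transform(y)              # roughly Gaussian after Yeo-Johnson
y_back = prep.inverse_transform(y_t)
assert np.allclose(y, y_back)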
    valor_arrecadacao_serie_temporal_lstm_treino = LSTMUtil.cria_intervalos_temporais(valor_treino_pwr)
    valor_arrecadacao_serie_temporal_lstm_teste = LSTMUtil.cria_intervalos_temporais(valor_teste_pwr)
    model = LSTMUnivariada(df_treino)
    checkpoint = ModelCheckpoint('checkpoint_regressor_' + tributo + '_teste_power_transformer.hdf5',
                                 monitor='loss', verbose=2, save_best_only=True,
                                 save_weights_only=False, mode='auto', period=1)
    model.compile(optimizer=ko.Adam(lr=0.1), loss='mse')
    model.fit([np_dia_mes_treino, valor_arrecadacao_serie_temporal_lstm_treino], saida_treino,
              validation_data=([np_dia_mes_teste, valor_arrecadacao_serie_temporal_lstm_teste], saida_teste),
              epochs=100, batch_size=50, callbacks=[checkpoint])

    # Load the best model saved by the checkpoint
    model.load_weights('checkpoint_regressor_' + tributo + '_teste_power_transformer.hdf5')
    pwr_pred = model.predict([np_dia_mes_teste, valor_arrecadacao_serie_temporal_lstm_teste])
    mae_pwr = mean_absolute_error(pwr_scaler.inverse_transform(saida_teste),
                                  pwr_scaler.inverse_transform(pwr_pred))
    print('MAE for tax ' + tributo + ' using the "Power Transformer": ' + str(mae_pwr))
    comparativo.loc[tributo, 'PowerTransformer'] = mae_pwr

# %% Train the univariate LSTM network (single quantitative variable) using the
# PowerTransformer as the scaler, since it performed best
for tributo in pd_arrecad_diaria['Tributo'].unique():
    # Use the helper that extracts a test dataset identical to the one used for Prophet
    df_treino, df_teste = LSTMUtil.gera_teste_identico_prophet(
        arrecad_diaria[tributo],
        pd_datas_testes.loc[tributo + ' - Prophet - Univariável - Sem Remoção de Outliers', 'Inicio'],
        pd_datas_testes.loc[tributo + ' - Prophet - Univariável - Sem Remoção de Outliers', 'Fim'])
    print('Tax ' + tributo + ' - test DF start: ' + str(df_teste.reset_index().loc[0, 'Data'])
          + ' test DF end: ' + str(df_teste.reset_index().loc[len(df_teste) - 1, 'Data']))
    df_treino = LSTMUtil.transforma_dataframe(df_treino, 'Data')
    df_teste = LSTMUtil.transforma_dataframe(df_teste, 'Data')
# Kalman-style measurement update. The first statement is truncated in the
# source; the leading `this_mu_test_posterior[i, :] = this_mu_test_prior[i, :] +`
# is reconstructed here from the matching covariance update below.
this_mu_test_posterior[i, :] = this_mu_test_prior[i, :] + np.dot(
    np.dot(np.dot(this_cov_test_prior[i, :, :], np.transpose(this_H[i, :, :])),
           np.linalg.inv(this_G[i, :, :])),
    this_residual[i, :])
this_cov_test_posterior[i, :] = this_cov_test_prior[i, :, :] - np.dot(
    np.dot(
        np.dot(
            np.dot(this_cov_test_prior[i, :, :], np.transpose(this_H[i, :, :])),
            np.linalg.inv(this_G[i, :, :])),
        this_H[i, :, :]),
    this_cov_test_prior[i, :, :])

this_mu_test_prior = this_mu_test_posterior
this_cov_test_prior = this_cov_test_posterior

# Map the five transformed posterior state means back to the original NO2 scale.
mu_test_posterior_inv = np.empty((5, 24))
for i in range(5):
    mu_test_posterior_inv[i, :] = pt_no2_0.inverse_transform(
        this_mu_test_posterior[i, :].reshape(-1, 1)).reshape(-1)

std_test_posterior = np.empty((5, 24))
for i in range(5):
    std_test_posterior[i, :] = np.sqrt(np.diag(this_cov_test_posterior[i, :, :]))

low_test_posterior = this_mu_test_prior - 2 * std_test_posterior
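# A self-contained numeric sketch of the same measurement update, with the
# usual Kalman identities spelled out: gain K = P H^T (H P H^T + R)^-1
# (G above plays the role of the innovation covariance), posterior mean
# mu + K @ residual, posterior covariance P - K H P. All names are
# illustrative stand-ins, not the variables used above.
import numpy as np

rng = np.random.default_rng(0)
n, m = 4, 2                        # state and observation dimensions
P = np.eye(n)                      # prior covariance
H = rng.normal(size=(m, n))        # observation matrix
R = 0.1 * np.eye(m)                # observation noise covariance
mu = rng.normal(size=n)            # prior mean
residual = rng.normal(size=m)      # observation minus predicted observation

G = H @ P @ H.T + R                # innovation covariance
K = P @ H.T @ np.linalg.inv(G)     # Kalman gain
mu_post = mu + K @ residual
P_post = P - K @ H @ P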
import time

import numpy as np
import pandas as pd
import pmdarima as pm
import streamlit as st
from sklearn.preprocessing import PowerTransformer
# calculate_test_residuals and calculate_errors are module-local helpers


def player_arima(data, player_name, index='date', feature='cumStatpoints',
                 forecast_from='2018-10-03', transform='none',
                 player_id=None, roster=None, summary=False):
    """Performs Auto-ARIMA on a single player."""
    # TODO: add logic for if the player ID is given but not a roster (use function in package)
    if player_id and roster:
        player_name = roster[roster['Unnamed: 0'] == player_id]
    player_df = data[data['name'] == player_name]
    player_df.drop_duplicates(subset='date', keep='first', inplace=True)
    player_train_df = player_df[player_df['date'] < forecast_from]
    player_test_df = player_df[player_df['date'] >= forecast_from]
    player_train_df = player_train_df.loc[:, [index, feature]]
    player_train_df = player_train_df.set_index(index, drop=True)
    if player_train_df.shape[0] == 0:
        st.write('{} is a rookie!'.format(player_name))
        return None
    if transform == 'log':  # TODO: make this stat agnostic
        player_train_df.loc[:, 'logValues'] = np.log(player_train_df['cumStatpoints'])
    elif transform == 'yj':
        transformer = PowerTransformer()
        transformer.fit(player_train_df.values.reshape(-1, 1))
        player_train_df.loc[:, 'transformedValues'] = transformer \
            .transform(player_train_df['cumStatpoints'].values.reshape(-1, 1))
        player_train_df.drop('cumStatpoints', axis=1, inplace=True)
    player_test_df = player_test_df.loc[:, [index, feature]]
    player_test_df = player_test_df.set_index(index, drop=True)
    # player_train_df = player_train_df[:'2018-10-03']
    # player_test_df = player_test_df['2018-10-03':]
    if player_test_df.shape[0] == 0:
        st.write('{} retired!'.format(player_name))
        return None
    start_time = time.time()
    st.write('Searching ARIMA parameters for {}...'.format(player_name))
    try:
        model = pm.auto_arima(player_train_df,
                              start_p=1, start_q=1,
                              max_p=5, max_q=5, max_d=3,
                              m=3, start_P=0, start_Q=0,
                              seasonal=True,
                              information_criterion='aicc',
                              error_action='ignore',
                              suppress_warnings=True,
                              stepwise=True)
        st.write('Model built, fitting...')
        model.fit(player_train_df)
    except ValueError:
        st.write("{} doesn't have enough data!".format(player_name))
        return None
    except IndexError:
        st.write('Index error for {}'.format(player_name))
        return None
    except Exception:
        st.write('Unhandled error for {}'.format(player_name))
        return None
    predictions, intervals = model.predict(n_periods=player_test_df.shape[0],
                                           return_conf_int=True)
    if transform == 'log':
        predictions = np.exp(predictions)
        intervals = np.exp(intervals)
    elif transform == 'yj':
        predictions = transformer.inverse_transform(predictions.reshape(-1, 1))
        low_intervals = transformer.inverse_transform(intervals[:, 0].reshape(-1, 1))
        high_intervals = transformer.inverse_transform(intervals[:, 1].reshape(-1, 1))
    end_time = time.time()
    if transform != 'yj':
        # decompose the (n, 2) interval array into lower/upper bound lists
        low_intervals = []
        high_intervals = []
        for low, high in intervals:
            low_intervals.append(low)
            high_intervals.append(high)
    prediction_residuals = calculate_test_residuals(predictions, player_test_df)
    if summary:
        st.text(model.summary())
    train_residuals = pd.DataFrame(model.resid())
    train_mfe, train_mae, train_rmse = calculate_errors(train_residuals)
    test_mfe, test_mae, test_rmse = calculate_errors(prediction_residuals)
    model_params = model.get_params()
    p, d, q = model_params['order']
    try:
        P, D, Q, m = model_params['seasonal_order']
    except TypeError:
        st.write('Search failed to find valid options.')
        return None
    st.write("{0}'s Auto-ARIMA({1},{2},{3})({4},{5},{6},{7}) took {8:.3f} seconds."
             .format(player_name, p, d, q, P, D, Q, m, end_time - start_time))
    results_df = pd.DataFrame({'forecastStart': forecast_from,
                               'aic': model.aic(),
                               'p': p, 'd': d, 'q': q,
                               'P': P, 'D': D, 'Q': Q, 'm': m,
                               'trainMfe': train_mfe,
                               'trainMae': train_mae,
                               'trainRmse': train_rmse,
                               'trainResiduals': [train_residuals],
                               'testMfe': test_mfe,
                               'testMae': test_mae,
                               'testRmse': test_rmse,
                               'testResiduals': [prediction_residuals],
                               'intervalLow': [low_intervals],
                               'intervalHigh': [high_intervals]},
                              index=[player_name])
    return results_df
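# A minimal sketch of the 'yj' post-processing path above: pmdarima returns
# predictions plus an (n, 2) array of confidence bounds in the transformed
# space, and each column is inverted separately. All values here are
# illustrative stand-ins for real model output.
import numpy as np
from sklearn.preprocessing import PowerTransformer

transformer = PowerTransformer()
train_values = np.random.default_rng(0).lognormal(size=(50, 1))
transformer.fit(train_values)

preds = np.array([0.1, 0.2, 0.3])                      # stand-in for model.predict(...)
intervals = np.array([[0.0, 0.2], [0.1, 0.3], [0.2, 0.4]])
preds_orig = transformer.inverse_transform(preds.reshape(-1, 1))
low_orig = transformer.inverse_transform(intervals[:, 0].reshape(-1, 1))
high_orig = transformer.inverse_transform(intervals[:, 1].reshape(-1, 1))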
def get_outliers(data, STD_NORM, side, METHOD='yeo-johnson', PLOT=False,
                 title=None, title_fontsize=None, x_label=None, y_label=None,
                 label_fontsize=None):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.preprocessing import PowerTransformer
    from statsmodels.graphics.gofplots import qqplot
    import colourPals as cp
    import importlib
    importlib.reload(cp)

    # ==================================================
    # Error checking
    assert side == 'left' or side == 'right', "'side' argument has to be either 'left' or 'right'"

    # ==================================================
    # If the minimum of the data is non-positive and 'box-cox' is selected,
    # compute a constant k to shift the data so that the transform can be performed.
    if METHOD == 'box-cox' and min(data) <= 0:
        k = 1 - min(data)
        data = data + k

    # ----- Transform the data
    pt = PowerTransformer(method=METHOD)
    # Find the optimal lambda for the transform
    pt.fit(data.to_numpy().reshape(-1, 1))
    # Transform the data towards a normal distribution
    data_trans = pt.transform(data.to_numpy().reshape(-1, 1))

    # ----- Compute the threshold beyond which points are treated as outliers
    data_trans_thres = data_trans.mean() + STD_NORM * data_trans.std()
    # Transform the threshold back to the original distribution
    data_thres = pt.inverse_transform(np.array(data_trans_thres).reshape(1, -1))
    data_thres = data_thres.flatten()[0]

    # If the data was shifted earlier, shift it back by the same constant.
    if 'k' in locals():
        data_thres = data_thres - k
        data = data - k

    # Depending on `side`, flag values below or above the threshold as outliers.
    if side == 'left':
        outliers = data[data < data_thres]
    elif side == 'right':
        outliers = data[data > data_thres]
    else:
        raise ValueError("Argument side has to be 'left' or 'right'")

    # Flattening converts the transformed data to a Series
    data_trans = pd.Series(data_trans.flatten())

    if PLOT:
        FIG_SIZE = 3
        sns.set_style("darkgrid")
        sns.set_context("notebook")
        fig, ax = plt.subplots(nrows=3, figsize=(FIG_SIZE*2, FIG_SIZE*3), dpi=300)

        # Plot the distribution before transformation
        sns.distplot(data, rug=True, kde=False, ax=ax[0], color=cp.cbPaired['blue'])
        ax[0].axvline(x=data_thres, c=cp.cbPaired['red'])
        ax[0].set_title(title, fontsize=title_fontsize)
        ax[0].set_xlabel(x_label, fontsize=label_fontsize)
        ax[0].set_ylabel("Frequency", fontsize=label_fontsize)

        # Plot the distribution after transformation
        sns.distplot(data_trans, rug=True, kde=False, ax=ax[1], color=cp.cbPaired['purple'])
        ax[1].axvline(x=data_trans_thres, c=cp.cbPaired['red'])
        ax[1].set_xlabel(f"{METHOD.capitalize()} Transformed", fontsize=label_fontsize)
        ax[1].set_ylabel("Frequency", fontsize=label_fontsize)

        # QQ-plot of the transformed data
        qqplot(data_trans, ax=ax[2], line='s', color=cp.cbPaired['purple'])
        plt.tight_layout()
        plt.show()

    return outliers, data_thres
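# Usage sketch: flag the right tail of a skewed sample as outliers. PLOT is
# left off, and the local colourPals helper module imported inside the
# function is assumed to be importable.
import numpy as np
import pandas as pd

sample = pd.Series(np.random.default_rng(0).lognormal(size=500))
outliers, threshold = get_outliers(sample, STD_NORM=2, side='right')
print(f'{len(outliers)} outliers above {threshold:.2f}')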
target_processor = PowerTransformer().fit(shuffled_y[:train].reshape(-1, 1))
transformed_y = target_processor.transform(shuffled_y.reshape(-1, 1)).flatten()

plt.title('Transformed target variable, train/test')
sns.distplot(transformed_y[:train])
sns.distplot(transformed_y[train:])

from sklearn.metrics import mean_absolute_error

regressor = MLPRegressor(hidden_layer_sizes=[20, 20],
                         activation='relu',
                         max_iter=1000,
                         random_state=1)
regressor.fit(shuffled_X[:train], transformed_y[:train])

"R2 %.3f, age error: %.2f, age spread (std): %.2f" % (
    r2_score(shuffled_y[train:],
             target_processor.inverse_transform(
                 regressor.predict(shuffled_X[train:]).reshape(-1, 1))),
    mean_absolute_error(shuffled_y[train:],
                        target_processor.inverse_transform(
                            regressor.predict(shuffled_X[train:]).reshape(-1, 1))),
    shuffled_y[train:].std())

# Now let's look at classification on the Iris dataset
# classifier = MLPClassifier(
#     hidden_layer_sizes=[32, 12],
#     activation='tanh',
#     max_iter=1000,
#     random_state=1)
# X_changed = MinMaxScaler(
#     feature_range=(-1, 1)
# ).fit_transform(X)
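# The manual transform/inverse_transform bookkeeping above is exactly what
# sklearn's TransformedTargetRegressor encapsulates; a sketch of the same
# pipeline using it (same shuffled_X/shuffled_y/train variables as above):
from sklearn.compose import TransformedTargetRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PowerTransformer

model = TransformedTargetRegressor(
    regressor=MLPRegressor(hidden_layer_sizes=[20, 20], activation='relu',
                           max_iter=1000, random_state=1),
    transformer=PowerTransformer())
model.fit(shuffled_X[:train], shuffled_y[:train])  # target transformed internally
preds = model.predict(shuffled_X[train:])          # already back on the original scale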
model.add(Dense(120))
model.add(Dense(10))
model.add(LSTM(48, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(3))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adagrad')
history = model.fit(Xtrain, Ytrain, batch_size=41, epochs=10000,
                    validation_data=(Xtest, Ytest))

# full-series predictions
Ypred = model.predict(X)
Ypred = ss.inverse_transform(Ypred)
# Ypred = scaler.inverse_transform(Ypred)
Ypred = np.reshape(Ypred, len(Ypred))
Ypred = pd.Series(Ypred)
Yreel = tt1[timestep:]

# test predictions
Ypred_test = model.predict(Xtest)
Ypred_test = Ypred_test.reshape(len(Ypred_test), 1)  # the scaler expects a 2-D column
Ypred_test = ss.inverse_transform(Ypred_test)
Ypred_test = np.reshape(Ypred_test, len(Ypred_test))
Ypred_test = pd.Series(Ypred_test)
Yreel_test = tt1[46:]

# train predictions
Ypred_train = model.predict(Xtrain)
Ypred_train = ss.inverse_transform(Ypred_train)
Ypred_train = np.reshape(Ypred_train, len(Ypred_train))
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer as PT


class PowerTransformer(BaseEstimator, TransformerMixin):
    """
    Power transform (Box-Cox / Yeo-Johnson).

    References
    ----------
    G.E.P. Box and D.R. Cox, "An Analysis of Transformations",
    Journal of the Royal Statistical Society B, 26, 211-252 (1964).
    """

    def __init__(self, *, method='yeo-johnson', standardize=False, lmd=None,
                 tolerance=(-np.inf, np.inf), on_err=None):
        """
        Parameters
        ----------
        method: 'yeo-johnson' or 'box-cox'
            'yeo-johnson' works with positive and negative values;
            'box-cox' only works with strictly positive values.
        standardize: boolean
            Whether to normalize to a standard normal. Using a separate
            `standard` function instead of this option is recommended.
        lmd: list or 1-dim ndarray
            You may assign each input column a specific lmd yourself.
            Leave as None (default) to use an inferred value.
            See `PowerTransformer`_ for details.
        tolerance: tuple
            Tolerance of lmd. Set to None to accept any value. The default is
            **(-np.inf, np.inf)**, but **(-2, 2)** is recommended for the
            Box-Cox transform.
        on_err: None or str
            Error handling when the lambda inference fails. Can be None, or
            the strings **log**, **nan** or **raise**. **log** returns the
            logarithmic transform of xs after a min-shift to 1. **nan**
            returns an ``ndarray`` of shape xs.shape filled with ``np.nan``.
            **raise** raises a FloatingPointError, which you can catch
            yourself. The default (None) returns the input series without
            any scale transform.

        .. _PowerTransformer:
            https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer
        """
        self._tolerance = tolerance
        self._pt = PT(method=method, standardize=standardize)
        self._lmd = lmd
        self._shape = None
        self._on_err = on_err

    def _check_type(self, x):
        if isinstance(x, list):
            x = np.array(x, dtype=float)
        elif isinstance(x, (DataFrame, Series)):
            x = x.values
        if not isinstance(x, np.ndarray):
            raise TypeError(
                'parameter `X` should be a `DataFrame`, `Series`, `ndarray` or list object '
                'but got {}'.format(type(x)))
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        return x

    def fit(self, x):
        """
        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The data used to compute the per-feature transformation.

        Returns
        -------
        self : object
            Fitted scaler.
        """
        x = self._pt._check_input(self._check_type(x), in_fit=True)
        # force constant column vectors to have no transformation (lambda=1)
        idx = []
        for i, col in enumerate(x.T):
            if np.all(col == col[0]):
                idx.append(i)
        if self._lmd is not None:
            if isinstance(self._lmd, float):
                self._pt.lambdas_ = np.array([self._lmd] * x.shape[1])
            elif x.shape[1] != len(self._lmd):
                raise ValueError(
                    'shape[1] of parameter `X` should be {} but got {}'.format(
                        x.shape[1], len(self._lmd)))
            else:
                self._pt.lambdas_ = np.array(self._lmd)
        else:
            self._pt.fit(x)
        if len(idx) > 0:
            self._pt.lambdas_[idx] = 1.
        return self

    def transform(self, x):
        ret = self._pt.transform(self._check_type(x))
        if isinstance(x, pd.DataFrame):
            return pd.DataFrame(ret, index=x.index, columns=x.columns)
        return ret

    def inverse_transform(self, x):
        ret = self._pt.inverse_transform(self._check_type(x))
        if isinstance(x, pd.DataFrame):
            return pd.DataFrame(ret, index=x.index, columns=x.columns)
        return ret
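# Usage sketch for the wrapper above: a constant column is pinned to
# lambda = 1 (the identity), while the other column gets a fitted
# Yeo-Johnson lambda. Inspecting the inner sklearn transformer via
# `_pt` is for demonstration only (fitting on a constant column may
# emit a numerical warning before the lambda is pinned).
import numpy as np

X = np.column_stack([np.full(50, 3.0),
                     np.random.default_rng(0).lognormal(size=50)])
pt = PowerTransformer()
pt.fit(X)
print(pt._pt.lambdas_)             # first entry is exactly 1.0
X_back = pt.inverse_transform(pt.transform(X))
assert np.allclose(X, X_back)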
def inference(month: str, regr, df_test: pd.core.frame.DataFrame,
              transform: bool, path_df: str = None, test: bool = True):
    """Generates inference files.

    Parameters
    ----------
    month : str
        Corresponding month of the test file.
    regr :
        Trained ML model used for inference generation.
    df_test : pd.core.frame.DataFrame
        Test dataset used for inference generation.
    transform : bool
        Whether to apply Box-Cox or not.
    path_df : str
        Path to save the inference files. Defaults to None.
    test : bool
        Whether the test files contain actual fuel load values. Defaults to True.
    """
    df_test_pred = df_test
    if test:
        # The inference files contain true labels: fit the scaler on them,
        # then drop them from the dataframe used for prediction.
        if transform:
            scaler = PowerTransformer(method="box-cox")
            scaler.fit_transform(np.array(df_test.actual_load).reshape(-1, 1))
        df_test_pred = df_test.drop(["actual_load"], axis=1)
    y_pred = regr.predict(df_test_pred)
    if test:
        if transform:
            y_pred_inv = scaler.inverse_transform(y_pred.reshape(-1, 1)).ravel()
        else:
            y_pred_inv = y_pred
        # If predicted fuel load values fall below zero, use min-max
        # normalization to map the predictions into the range of the actual
        # fuel load values.
        if y_pred_inv.min() < 0:
            # range of the predicted fuel load values
            range_fl_predicted = max(y_pred_inv) - min(y_pred_inv)
            if range_fl_predicted != 0:
                # normalize the predictions by their own range
                y_pred_inv = (y_pred_inv - min(y_pred_inv)) / range_fl_predicted
            range_fl_actual = max(df_test.actual_load) - min(df_test.actual_load)
            if range_fl_actual != 0:
                # rescale to the range of the actual fuel load values
                y_pred_inv = y_pred_inv * range_fl_actual + min(df_test.actual_load)
        # Store the inference file as a pandas dataframe
        output_df = pd.DataFrame(data={
            "lat": df_test.latitude,
            "lon": df_test.longitude,
            "actual_load": df_test.actual_load,
            "predicted_load": y_pred_inv,
            "APE": (np.abs((df_test.actual_load - y_pred_inv) / df_test.actual_load)) * 100,
        })
        mape = np.mean(np.abs((df_test.actual_load - y_pred_inv) / df_test.actual_load)) * 100
        if path_df is not None:
            output_df.to_csv(path_df, index=False)
        return mape
    else:
        scaler_filename = SCALER_FILENAME
        scaler = load(scaler_filename)  # load the persisted sklearn transformer
        if transform:
            y_pred_inv = scaler.inverse_transform(y_pred.reshape(-1, 1)).ravel()
        else:
            y_pred_inv = y_pred
        output_df = pd.DataFrame(data={
            "lat": df_test.latitude,
            "lon": df_test.longitude,
            "predicted_load": y_pred_inv,
        })
        if path_df is not None:
            output_df.to_csv(path_df, index=False)
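# A tiny numeric illustration of the re-ranging step above: predictions that
# dip below zero are min-max normalized to [0, 1] and then rescaled to the
# span of the actual loads, so the output stays within the observed range.
import numpy as np

y_pred_inv = np.array([-0.2, 0.1, 0.6])
actual = np.array([0.0, 0.5, 1.0])
rng_pred = y_pred_inv.max() - y_pred_inv.min()
y_norm = (y_pred_inv - y_pred_inv.min()) / rng_pred               # -> [0, 1]
y_scaled = y_norm * (actual.max() - actual.min()) + actual.min()
print(y_scaled)                                                   # [0.    0.375 1.   ]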
class linReg:

    def __init__(self, in_df):
        df = self.__imputeVals(in_df.copy())
        self.X = df.drop(columns=["SalePrice"]).copy()
        self.y = np.log(df.SalePrice.values.reshape(-1, 1))
        self._gridSearch = None
        self.pipeline_X = self.__make_pipe()
        # self.pipeline_y = StandardScaler()
        self.pipeline_y = PowerTransformer()
        self._searchSpace = None
        self._params = None
        self.lm = ElasticNet()

    def __imputeVals(self, in_df):
        return imputeVals(in_df)

    def __make_pipe(self):
        nonePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value="None"),
            OneHotEncoder(drop="first"))
        zeroPipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            PowerTransformer())
        regressionPipeline = ColumnTransformer([
            ("setNone", nonePipeline, fillNone),
            ("setZero", zeroPipeline, fillZeroCat),
            ("transformed", scalePipeline, fillZeroCont),
            ("dictImputed", make_pipeline(dictImputer(imputeDict),
                                          OneHotEncoder(drop="first")),
             list(imputeDict.keys())),
            ("bool", "passthrough", imputeBool),
            ("categoricalInts", "passthrough", cat_to_int),
            ("dropped", "drop", dropList)
        ], remainder="drop")
        return regressionPipeline

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        self._searchSpace = params
        # self._params = None
        piped_X = self.pipeline_X.fit_transform(self.X)
        piped_y = self.pipeline_y.fit_transform(self.y)
        self._gridSearch = GridSearchCV(self.lm, params, cv=cv,
                                        scoring="neg_mean_squared_error",
                                        n_jobs=njobs, verbose=verbose)
        self._gridSearch.fit(piped_X, piped_y)

    def getBestParams(self):
        if self._gridSearch is not None:
            return self._gridSearch.best_params_
        else:
            raise ValueError()

    def getBestScore(self):
        if self._gridSearch is not None:
            return self._gridSearch.best_score_
        else:
            raise ValueError()

    def fitModel(self, params):
        piped_X = self.pipeline_X.fit_transform(self.X)
        piped_y = self.pipeline_y.fit_transform(self.y)
        self._params = params
        self.lm.set_params(**params)
        self.lm.fit(piped_X, piped_y)

    def __invert(self, y):
        return np.exp(self.pipeline_y.inverse_transform(y))

    def getTrainScore(self):
        piped_X = self.pipeline_X.transform(self.X)
        piped_y = self.pipeline_y.transform(self.y)
        return self.lm.score(piped_X, piped_y)

    # Mean squared error in log space (the squared RMSLE; no square root is taken)
    def getRMSLE(self):
        piped_X = self.pipeline_X.transform(self.X)
        preds = self.lm.predict(piped_X).reshape(-1, 1)
        preds = self.pipeline_y.inverse_transform(preds)
        return mean_squared_error(self.y, preds)

    def predict(self, test_X):
        piped_X = self.pipeline_X.transform(self.__imputeVals(test_X))
        preds = self.lm.predict(piped_X).reshape(-1, 1)
        return self.__invert(preds)
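# Usage sketch for linReg (train_df/test_df are hypothetical Ames-style
# DataFrames with a SalePrice column; the module-level imputation helpers
# referenced above must be defined):
model = linReg(train_df)
model.gridSearch({'alpha': [0.001, 0.01, 0.1], 'l1_ratio': [0.1, 0.5, 0.9]})
model.fitModel(model.getBestParams())
print(model.getTrainScore())
predictions = model.predict(test_df)   # back on the original price scale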
class DeepAR(Model):

    def __init__(self, train_ds_all: ListDataset, model=None,
                 transform: str = 'none', predictor=None):
        '''
        Parameters
        ----
        train_ds_all: a special ListDataset instance for training, as defined in
            MXNet's GluonTS package. ListDataset contains player_dict dictionaries
            for each player, as defined in Model.py
        model: optional pre-existing UNTRAINED model.
        predictor: optional pre-existing TRAINED model.
        transform: transform applied during pre-/post-processing for ALL ARIMA
            models. Specify as a string. Currently supports the 'yj' and 'log'
            transforms.
        '''
        super().__init__()
        self.data_train = train_ds_all
        self.estimator = model
        self.predictor = predictor
        # Add hparams
        self.transform = transform if self.estimator is None else None
        self.power_transformer = PowerTransformer() if self.estimator is None else None

    def create(self,
               data_train,              # ListDataset containing training data + metadata
                                        # NOTE: provides 'feat_dynamic_cat', 'feat_dynamic_real',
                                        # 'feat_static_cat', 'name', 'start', 'target'
               save_path,               # Save location
               use_exog_feat=False,     # Whether or not to use the exogenous features for modelling
               num_epochs=50,           # Number of epochs to train
               lr=1e-3,                 # Learning rate
               batch_size=64,           # Batch size
               scaling=False,           # Whether to scale the data
               context_length=3,        # Number of samples to roll out LSTM/GRU
               num_layers=3,            # Number of RNN layers
               embedding_dimension=16,  # Dimension of the embedding layer
               context='cpu',           # GPU/CPU training setting
               prediction_length=82,    # Forecast horizon
               cardinality=None,        # Number of values in each categorical feature (inferred if None)
               lags_seq=None,           # Indices of the lagged target values to use as inputs of the RNN
               dropout_rate=0.1,        # Dropout rate
               num_cells=40,            # Number of cells in the model
               cell_type='lstm',        # Type (LSTM or GRU)
               num_parallel_samples=100):  # Number of parallel predictions to sample from the learnt distribution
        '''
        Creates a model for ALL the players in the training dataset

        Parameters
        ----
        As defined above

        Returns
        ----
        estimator: a DeepAREstimator instance ready to be trained
        '''
        self.data_train = data_train
        # Use the metadata of an arbitrary player to get the frequency, since it is always the same
        freq = data_train.list_data[0]['freq']
        trainer = Trainer(batch_size=batch_size, epochs=num_epochs,
                          learning_rate=lr, ctx=context, hybridize=False)
        estimator = DeepAREstimator(
            freq=freq,
            prediction_length=prediction_length,
            scaling=scaling,
            context_length=context_length,
            num_layers=num_layers,
            embedding_dimension=embedding_dimension,
            trainer=trainer,
            use_feat_dynamic_real=True if use_exog_feat else False,
            use_feat_static_cat=False,
            use_feat_static_real=False,
            cardinality=cardinality,
            lags_seq=lags_seq,
            dropout_rate=dropout_rate,
            num_cells=num_cells,
            cell_type=cell_type,
            num_parallel_samples=num_parallel_samples)
        self.estimator = estimator
        return estimator

    def preprocess(self, player_train_labels):
        # self.power_transformer, transform, stand, scale
        '''
        Helper method to preprocess data for a SINGLE player

        Parameters
        ----
        player_train_labels: labels for a single player's training
            (expected to be a single-column DataFrame; the .iloc accesses
            below assume this)

        Returns
        ----
        preprocessed labels for training
        '''
        # By definition, there is only one column in the df
        try:
            assert np.array(player_train_labels).shape[1] == 1
        except (AssertionError, IndexError):
            logging.warning('Horizontal list?')
            assert np.array(player_train_labels).reshape(-1, 1).shape[0] == len(
                np.array(player_train_labels))
            player_train_labels = np.array(player_train_labels).reshape(-1, 1)
        if self.transform == 'log':  # TODO: make this stat agnostic
            player_train_labels.iloc[:, 0] = np.log(player_train_labels.iloc[:, 0])
        elif self.transform == 'yj':
            transformer = self.power_transformer
            transformer.fit(player_train_labels.iloc[:, 0].values.reshape(-1, 1))
            player_train_labels.iloc[:, 0] = transformer.transform(
                player_train_labels.iloc[:, 0].values.reshape(-1, 1))
        return player_train_labels

    def fit(self):
        '''
        Parameters
        -----
        None

        Returns
        -----
        predictor: trained model
        '''
        self.predictor = self.estimator.train(self.data_train)

    def predict(self, num_per=None, return_conf_int=True):
        '''
        Parameters
        -----
        num_per: unused, since constant
        return_conf_int: unused, since always True

        Returns
        -----
        pred_generator: predictions generator
        '''
        pred_generator = self.predictor.predict(self.data_train)
        return pred_generator

    # TODO: add boolPredictInsample option
    def postprocess(self, targets=None, predictions=None, intervals=None):
        # TODO: clean up
        # TODO: postprocess for scale/stand
        '''
        Helper method to postprocess data for a SINGLE player

        Parameters
        ----
        targets: labels for a single player's training
        intervals: see above
        predictions: see above

        Returns
        ----
        post-processed versions of each of the above
        '''
        # Rebinding the loop variable would not propagate back to
        # targets/predictions, so collect the processed values explicitly.
        processed = []
        for val in [targets, predictions]:
            if val is not None:
                # Reshape the prediction vectors
                val = np.array(val).reshape(-1, 1)
                if len(np.array(val).shape) > 2:
                    val = np.array(val)[0]
                # Transform
                if self.transform == 'log':
                    val = np.exp(val)
                elif self.transform == 'yj':
                    val = self.power_transformer.inverse_transform(val.reshape(-1, 1))
            processed.append(val)
        targets, predictions = processed
        if intervals is not None:
            # Reshape the array of prediction confidence intervals
            intervals = np.array(intervals).reshape(-1, 2)
            if len(np.array(intervals).shape) > 3:
                intervals = np.array(intervals)[0]
            # Transform and decompose
            if self.transform == 'yj':
                low_intervals = self.power_transformer.inverse_transform(
                    intervals[:, 0].reshape(-1, 1))
                high_intervals = self.power_transformer.inverse_transform(
                    intervals[:, 1].reshape(-1, 1))
            else:
                if self.transform == 'log':
                    intervals = np.exp(intervals)
                # Decompose into lower and upper bounds
                low_intervals = []
                high_intervals = []
                for low, high in intervals:
                    low_intervals.append(low)
                    high_intervals.append(high)
            return targets, predictions, low_intervals, high_intervals
        return targets, predictions, intervals

    def process_prediction(self, prediction):
        '''
        Processes predictions for all players
        '''
        mean = prediction.mean_ts
        mean = mean.reset_index()
        mean = mean.rename(columns={0: 'predictions'})
        mean = mean.rename(columns={'index': 'date'})
        mean = mean.drop(columns=['date'])
        mean['gameNumber'] = mean.index + 1
        conf = pd.DataFrame()
        conf.loc[:, 'low'] = prediction.quantile('0.05')
        conf.loc[:, 'high'] = prediction.quantile('0.95')
        full_df = pd.concat([mean, conf], axis=1)
        return full_df

    def generate_prediction_df(self, predictions, data, drop=True,
                               target='cumStatpoints', scaled=None, scaling_loc=None):
        '''
        Postprocess predictions for ALL players and return them as a df
        '''
        if scaled is not None:
            scaling_meta = pd.read_pickle(scaling_loc)
            print(scaling_meta)
        names = data.loc[:, 'name'].unique()
        full_predictions = pd.DataFrame()
        # ONE FORECAST OF LENGTH prediction_length PER PLAYER, in the order of data['name']
        for prediction, name in zip(predictions, names):
            player_df = pd.DataFrame()
            # DF of 'name', 'gameNumber', target for ONE player
            player_data = data.loc[data.loc[:, 'name'] == name].loc[:, ['name', 'gameNumber', target]]
            data_length = player_data.shape[0]
            prediction_df = self.process_prediction(prediction)
            if drop:
                # Drop excess predictions if no data is available for evaluation
                prediction_df = prediction_df.iloc[:data_length, :]
            player_data.reset_index(drop=True, inplace=True)
            prediction_df.reset_index(drop=True, inplace=True)
            if scaled == 'ss':
                scale_data = scaling_meta.loc[scaling_meta.loc[:, 'name'] == name]
                for column in ['predictions', 'low', 'high']:
                    prediction_df.loc[:, column] = ((prediction_df.loc[:, column] * scale_data['maxabs'])
                                                    * scale_data['std']) + scale_data['mean']
            elif scaled == 'unit':
                scale_data = scaling_meta.loc[scaling_meta.loc[:, 'name'] == name]
                for column in ['predictions', 'low', 'high']:
                    prediction_df.loc[:, column] = (prediction_df.loc[:, column]
                                                    - scale_data['min'].values) / scale_data['scale'].values
            player_data_df = pd.concat([player_data, prediction_df], axis=1)
            full_predictions = pd.concat([full_predictions, player_data_df])
        return full_predictions

    # NOTE: Not possible to implement at this point. See the presentation for details
    def update(self, new_data_ds):
        pass

    # NOTE: not implemented because updating is not possible
    def evaluate(self, test_ds_all: ListDataset, horizon: int = 0):
        pass
class PreprocessData:

    def __init__(self, preprocess_type=None, extend_data=False, short_end=False):
        self.config = Config()

        # prepare input data
        config_path = self.config.get_filepath("", "config.yaml")
        config_file = open(config_path, 'r')
        yaml_config = yaml.load(config_file, Loader=yaml.SafeLoader)

        self.training_dataset_names = [d['name'] for d in yaml_config['training_datasets']]
        self.training_dataset_start_pos = [d['start_position'] for d in yaml_config['training_datasets']]
        self.test_dataset_names = [d['name'] for d in yaml_config['test_datasets']]
        self.test_dataset_start_pos = [d['start_position'] for d in yaml_config['test_datasets']]

        self.dataset_names = np.concatenate(
            (self.training_dataset_names, self.test_dataset_names))  # do we need these?
        self.dataset_start_pos = np.concatenate(
            (self.training_dataset_start_pos, self.test_dataset_start_pos))  # do we need these?

        # read in all pickle files
        self.all_pd = []
        for dataset_name in self.dataset_names:
            self.all_pd.append(pd.read_pickle(self.config.get_filepath_data(dataset_name)))

        if extend_data:
            training_dataset_names_copy = np.array(self.training_dataset_names, copy=True)
            # create a copy of the data shifted up by 10
            for i, dataset_name in enumerate(training_dataset_names_copy):
                self.dataset_names = np.append(self.dataset_names, dataset_name + "_" + str(10))
                self.training_dataset_names = np.append(self.training_dataset_names,
                                                        dataset_name + "_" + str(10))
                self.dataset_start_pos = np.append(self.dataset_start_pos,
                                                   self.training_dataset_start_pos[i])
                self.training_dataset_start_pos.append(self.training_dataset_start_pos[i])
                self.all_pd.append(self.all_pd[i].copy() + 10)

        self.dict_datasets = dict(zip(self.dataset_names, np.arange(len(self.dataset_names))))

        self.enable_difference = False

        self._feature_range = [0, 1]
        self.normalisation_scalers = []
        for _ in self.dataset_names:
            self.normalisation_scalers.append(MinMaxScaler(feature_range=self.feature_range))
        self.enable_normalisation_scaler = False
        self.enable_ignore_price = False  # scale each curve to feature_range

        self.power_transformer = PowerTransformer()
        self.enable_power_transform = False

        self.standardisation_scalers = []
        for _ in self.dataset_names:
            self.standardisation_scalers.append(StandardScaler())
        self.enable_standardisation_scaler = False

        self.enable_log_returns = False
        self.mult_factor = 10  # 5
        self.add_factor = 25  # 6
        self.enable_log = False
        self.enable_pct_change = False
        self.enable_curve_smoothing = False
        self.short_end = short_end

        # now set up the PreprocessType settings
        if preprocess_type is PreprocessType.NORMALISATION_OVER_TENORS:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
        elif preprocess_type is PreprocessType.NORMALISATION_OVER_CURVES:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
            self.enable_ignore_price = True
        elif preprocess_type is PreprocessType.STANDARDISATION_OVER_TENORS:
            self.enable_standardisation_scaler = True
        elif preprocess_type is PreprocessType.LOG_RETURNS_OVER_TENORS:
            self.enable_log_returns = True

    @property
    def feature_range(self):  # implements the getter - this name is *the* name
        return self._feature_range

    @feature_range.setter
    def feature_range(self, value):  # name must be the same
        self._feature_range = value
        for i, _ in enumerate(self.dataset_names):
            self.normalisation_scalers[i] = MinMaxScaler(feature_range=value)

    def get_data(self, training_dataset_names=None, test_dataset_names=None, chunks_of=None):
        if training_dataset_names is None:
            training_dataset_names = self.training_dataset_names
        if isinstance(training_dataset_names, str):
            training_dataset_names = np.array([training_dataset_names])

        if test_dataset_names is None:
            test_dataset_names = self.test_dataset_names
        if test_dataset_names is None and self.test_dataset_names is None:
            test_dataset_names = []
        if isinstance(test_dataset_names, str):
            test_dataset_names = np.array([test_dataset_names])

        training_data = []
        test_data = []
        training_data_scaled = []
        test_data_scaled = []
        for key, value in self.dict_datasets.items():
            start_position = self.dataset_start_pos[value]
            end_position = None
            if chunks_of is not None:
                end_position = chunks_of * ((self.all_pd[value].shape[0] - start_position) // chunks_of)

            if key in training_dataset_names:
                # we take the log returns of each data set and scale wrt the first dataset
                new_training_data = self.all_pd[value].copy()[start_position:end_position]
                if self.short_end:
                    new_training_data = new_training_data.iloc[:, 0]
                new_training_data_scaled = self.scale_data(new_training_data, value, True)
                training_data.append(new_training_data)
                training_data_scaled.append(new_training_data_scaled)

            if key in test_dataset_names:
                new_test_data = self.all_pd[value].copy()[start_position:end_position]
                if self.short_end:
                    new_test_data = new_test_data.iloc[:, 0]
                # todo: should we scale test data wrt training data?
                new_test_data_scaled = self.scale_data(new_test_data, value, True)
                test_data.append(new_test_data)
                test_data_scaled.append(new_test_data_scaled)

        maturities = self.all_pd[0].columns.values / (30 * 12)  # in years

        if test_dataset_names is not None:
            return (training_data, test_data, training_data_scaled, test_data_scaled,
                    training_dataset_names, test_dataset_names, maturities)
        else:
            return training_data_scaled, maturities

    # def rescale_data_inputter(self, data, datasets=None):
    #     rescaled_data = []
    #     if datasets == "train":
    #         for i, name in enumerate(self.training_dataset_names):
    #             # pos = self.dict_datasets[name]
    #             rescaled_data.append(self.rescale_data(data[i], dataset_name=name))
    #     elif datasets == "test":
    #         for i, name in enumerate(self.test_dataset_names):
    #             # pos = self.dict_datasets[name]
    #             # self.scale_data(self, data, dataset_num=pos)
    #             rescaled_data.append(self.rescale_data(data[i], dataset_name=name))
    #     return rescaled_data

    def scale_data(self, data, dataset_name=None, should_fit=False):
        # if given a numpy array, convert it to a dataframe first
        if type(data) is np.ndarray:
            _data = pd.DataFrame(data=data)
        elif isinstance(data, list):
            _data_list = []
            # if isinstance(dataset_name, list):
            for _data, _dataset_name in zip(data, dataset_name):
                _data_list.append(self.scale_data(_data, _dataset_name, should_fit))
            # else:
            #     for _data in data:
            #         _data_list.append(self.scale_data(_data, should_fit, dataset_name))
            return _data_list
        else:
            _data = data.copy()

        time = _data.axes[0].tolist()
        # maturities = _data.columns.values

        dataset_num = 999
        if dataset_name is not None:
            if isinstance(dataset_name, numbers.Integral):
                dataset_num = dataset_name
            else:
                for key, value in self.dict_datasets.items():
                    if key == dataset_name:
                        dataset_num = value

        if self.enable_log:
            _data = _data.apply(np.log)

        if self.enable_difference:
            _data = _data.diff(axis=1)
            _data = _data.fillna(0)

        if self.enable_pct_change:
            _data = _data.pct_change()
            _data = _data.fillna(0)

        if self.enable_log_returns:
            # add add_factor to make the ratio non-negative, so the log can be taken later
            shift = (_data.shift(0) + self.add_factor) / (_data.shift(1) + self.add_factor)
            shift = shift.dropna()
            if not (np.array(shift) > 0).all():
                # some values are non-positive... this will break the log
                print("NON-POSITIVE VALUES FOUND, CANNOT PASS THROUGH LOG!!")
                print(np.min(_data))
                print(shift)
            _data = self.mult_factor * np.log(shift)
            time = _data.axes[0].tolist()

        # from here on use only numpy; convert the pandas frame to a numpy array
        _data = _data.values

        if self.short_end and len(_data.shape) == 1:
            _data = _data.reshape(-1, 1)

        if self.enable_standardisation_scaler:
            if not self.enable_ignore_price:
                if should_fit:
                    self.standardisation_scalers[dataset_num].fit(_data)
                _data = self.standardisation_scalers[dataset_num].transform(_data)
            else:
                data_temp = []
                for row in _data:
                    # row_as_2d = row.reshape(1, -1)
                    row_as_column = row[:, np.newaxis]
                    self.standardisation_scalers[dataset_num].fit(row_as_column)
                    temp = self.standardisation_scalers[dataset_num].transform(row_as_column)
                    data_temp.append(temp.ravel())
                _data = np.array(data_temp)

        if self.enable_normalisation_scaler:
            if not self.enable_ignore_price:
                if should_fit:
                    self.normalisation_scalers[dataset_num].fit(_data)
                _data = self.normalisation_scalers[dataset_num].transform(_data)
            else:
                data_temp = []
                for row in _data:
                    # row_as_2d = row.reshape(1, -1)
                    row_as_column = row[:, np.newaxis]
                    self.normalisation_scalers[dataset_num].fit(row_as_column)
                    temp = self.normalisation_scalers[dataset_num].transform(row_as_column)
                    data_temp.append(temp.ravel())
                _data = np.array(data_temp)

        if self.enable_power_transform:
            if should_fit:
                self.power_transformer.fit(_data)
            _data = self.power_transformer.transform(_data)

        df = pd.DataFrame(data=_data, index=np.array(time))
        return df

    def rescale_data(self, data, dataset_name=None, start_value=None, index=None, columns=None):
        if isinstance(data, pd.DataFrame):
            if columns is None:
                columns = data.columns.values
            if index is None:
                index = data.index.values

        if type(data) is np.ndarray:
            temp_data = data
        else:
            temp_data = np.array(data)

        if self.short_end and len(temp_data.shape) == 1:
            temp_data = temp_data.reshape(-1, 1)

        dataset_num = 999
        if dataset_name is not None:
            for key, value in self.dict_datasets.items():
                if key == dataset_name:
                    dataset_num = value

        if self.enable_difference:
            temp_data = temp_data  # TODO: inverse difference

        if self.enable_power_transform:
            temp_data = self.power_transformer.inverse_transform(temp_data)

        if self.enable_normalisation_scaler:
            # we need to scale each rolling window manually
            if self.enable_ignore_price:
                # rescale each curve individually
                data_min = self.all_pd[dataset_num].min(axis=1)
                data_max = self.all_pd[dataset_num].max(axis=1)
                a = self.feature_range[0]
                b = self.feature_range[1]
                for i in np.arange(temp_data.shape[0]):
                    temp_data[i] = ((temp_data[i] - a) / (b - a)) * (data_max[i] - data_min[i]) + data_min[i]
            else:
                if len(temp_data.shape) == 3:
                    new_temp_data = []
                    for i in np.arange(temp_data.shape[0]):
                        new_temp_data.append(
                            self.normalisation_scalers[dataset_num].inverse_transform(temp_data[i]))
                    temp_data = np.array(new_temp_data)
                else:
                    temp_data = self.normalisation_scalers[dataset_num].inverse_transform(temp_data)

        if self.enable_standardisation_scaler:
            # temp_data = self.standardisation_scaler.inverse_transform(temp_data)
            if self.enable_ignore_price:
                raise NotImplementedError
            else:
                if len(temp_data.shape) == 3:
                    new_temp_data = []
                    for i in np.arange(temp_data.shape[0]):
                        new_temp_data.append(
                            self.standardisation_scalers[dataset_num].inverse_transform(temp_data[i]))
                    temp_data = np.array(new_temp_data)
                else:
                    temp_data = self.standardisation_scalers[dataset_num].inverse_transform(temp_data)

        if self.enable_log:
            temp_data = np.exp(temp_data)

        if self.enable_log_returns:
            # if start_value is not given but dataset_name is, use the first value of the dataset
            if dataset_name is not None and start_value is None:
                _start_value = self.all_pd[dataset_num].iloc[0]
            elif start_value is not None:
                _start_value = start_value
            else:
                _start_value = 1.

            # print("shapes, log-return rescale", temp_data.shape, _start_value.shape, _start_value[0].shape)
            if len(temp_data.shape) == 1:
                z = np.exp(temp_data / self.mult_factor)
                z = np.insert(np.array(z), 0,
                              _start_value[0] + self.add_factor)  # instead of the usual _start_value
                temp_data = np.cumprod(z) - self.add_factor
                temp_data = pd.DataFrame(data=temp_data, index=self.all_pd[dataset_num].index)
                # print(temp_data.head(10))
            elif len(temp_data.shape) == 2:
                # when taking log-returns on an individual batch, todo: check
                if self.short_end:
                    z = np.exp(temp_data / self.mult_factor)
                    z = np.insert(z, 0, _start_value[0] + self.add_factor, axis=0)
                    temp_data = np.cumprod(z, axis=0) - self.add_factor
                else:
                    z = np.exp(temp_data / self.mult_factor)
                    z = np.insert(z, 0, _start_value + self.add_factor, axis=0)
                    temp_data = np.cumprod(z, axis=0) - self.add_factor
            elif len(temp_data.shape) > 2:
                # when taking log-returns on multiple batches
                z = np.exp(temp_data[:, :] / self.mult_factor)
                z = np.insert(z, 0, _start_value + self.add_factor, axis=1)
                temp_data = np.cumprod(z, axis=1) - self.add_factor
            else:
                z = np.exp(temp_data[0, :] / self.mult_factor)
                z = np.insert(z, 0, _start_value + self.add_factor)
                temp_data = np.cumprod(z) - self.add_factor
            # print("log returns undo...", _start_value, temp_data[0])

        if self.enable_curve_smoothing:
            curve_smooth = []
            for curve in temp_data:
                curve_smooth.append(savgol_filter(curve, 23, 5))  # window size 23, polynomial order 5
            temp_data = np.array(curve_smooth)

        if index is not None and columns is not None:
            return pd.DataFrame(temp_data, index=index, columns=columns)
        else:
            return temp_data
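# A numeric sketch of the log-returns round trip used above: the forward
# transform is mult_factor * log((x_t + add_factor) / (x_{t-1} + add_factor)),
# and the inverse rebuilds the series via a cumulative product of
# exp(d / mult_factor) seeded with the start value.
import numpy as np

mult_factor, add_factor = 10, 25
x = np.array([1.0, 1.5, 1.2, 2.0])
shift = (x[1:] + add_factor) / (x[:-1] + add_factor)
d = mult_factor * np.log(shift)                      # forward transform

z = np.exp(d / mult_factor)
z = np.insert(z, 0, x[0] + add_factor)
x_back = np.cumprod(z) - add_factor                  # inverse transform
assert np.allclose(x, x_back)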
# (The estimator construction is truncated in the source; its final argument
# is `random_state=randomstate`.)
clf.fit(X_train, y_train)
# save the classifier for further use
dump(clf, clfpath)
print("Training complete...")
# clf = load(clfpath)

# VALIDATION SET
# load validation data
validationfeatures = pd.read_csv(
    "/media/yannick/c4a7e8d3-9ac5-463f-b6e6-92e216ae6ac0/BRATS/BraTS2020/validationfeat_normalized.csv",
    index_col="ID")
y_pred_validation_tmp = clf.predict(validationfeatures)
# map the predictions back to the original target scale
y_pred_validation = np.squeeze(ptfm.inverse_transform(y_pred_validation_tmp.reshape(-1, 1)))

pred_validation_df = pd.DataFrame(data=zip(validationfeatures.index.values, y_pred_validation),
                                  columns=["ID", "Prediction"])
pred_validation_df.to_csv(os.path.join(outpath, "validationprediction_powertfm_FINAL.csv"),
                          header=False, index=False)

# TESTING SET
# load test data
testfeatures = pd.read_csv(
    "/media/yannick/c4a7e8d3-9ac5-463f-b6e6-92e216ae6ac0/BRATS/BraTS2020/testingfeat_normalized_NEW.csv",
    index_col="BraTS20ID")
y_pred_test_tmp = clf.predict(testfeatures)
def create_predictions_df(df, kmeans, knn):
    # create a DataFrame masked to 2018 & 2019
    predictions = df[(df.year == 2018) | (df.year == 2019)]
    # total units over 2018-2019
    total_units = pd.DataFrame(predictions.groupby(["city", "state"]).total_high_density_units.sum())
    # total buildings over 2018-2019
    total_bldgs = pd.DataFrame(predictions.groupby(["city", "state"]).total_high_density_bldgs.sum())
    # total value over 2018-2019
    total_value = pd.DataFrame(predictions.groupby(["city", "state"]).total_high_density_value.sum())
    # merge total_units into predictions
    predictions = predictions.merge(total_units, how="left", on=["city", "state"], suffixes=("_og", "_1819"))
    # merge total_bldgs into predictions
    predictions = predictions.merge(total_bldgs, how="left", on=["city", "state"], suffixes=("_og", "_1819"))
    # merge total_value into predictions
    predictions = predictions.merge(total_value, how="left", on=["city", "state"], suffixes=("_og", "_1819"))
    # 2018-2019 total units and buildings are needed to calculate the proper weighted average
    predictions = predictions.groupby("city_state")[["total_high_density_units_1819",
                                                     "total_high_density_bldgs_1819"]].mean()
    # mask the initial df to the last two years, group by city_state to get 130
    # unique observations, and take the mean of ei, total buildings, and total valuation
    avgs = df[(df.year == 2018) | (df.year == 2019)].groupby("city_state")[[
        "ei_x", "total_high_density_bldgs", "total_high_density_value"]].mean()
    # predictions["avg_units_per_bldg"] = predictions["total_high_density_units_1819"] / predictions["total_high_density_bldgs_1819"]
    # predictions.drop(columns="total_high_density_units_1819", inplace=True)
    # weighted average number of units per building over 2018-2019
    avgs["avg_units_per_bldg"] = (predictions["total_high_density_units_1819"]
                                  / predictions["total_high_density_bldgs_1819"])
    # create, fit, and apply the scaler
    scaler = PowerTransformer()
    scaler.fit(avgs[["avg_units_per_bldg", "ei_x"]])
    avgs[["avg_units_per_bldg", "ei_x"]] = scaler.transform(avgs[["avg_units_per_bldg", "ei_x"]])
    # define features for KMeans modeling
    X = avgs[["avg_units_per_bldg", "ei_x"]]
    avgs["cluster"] = kmeans.predict(X)
    avgs[["avg_units_per_bldg", "ei_x"]] = scaler.inverse_transform(avgs[["avg_units_per_bldg", "ei_x"]])
    scaler, avgs_scaled = min_max_scaler_prediction(avgs)
    avgs["label"] = knn.predict(avgs_scaled)
    city = avgs.reset_index().city_state.str.split("_", n=1, expand=True)[0]
    state = avgs.reset_index().city_state.str.split("_", n=1, expand=True)[1]
    avgs = avgs.reset_index()
    avgs["city"] = city
    avgs["state"] = state
    # copies avoid pandas chained-assignment warnings when the labels are set below
    df_best = avgs[(avgs.label) & ((avgs.cluster == 0) | (avgs.cluster == 4))].copy()
    df_high_density = avgs[(avgs.label) & ((avgs.cluster == 5) | (avgs.cluster == 2))].copy()
    df_stable_high_markets = avgs[(avgs.label) & ((avgs.cluster == 3) | (avgs.cluster == 1))].copy()
    df_best["recommendation_label"] = "Best_ROI"
    df_high_density["recommendation_label"] = "medium_ROI"
    df_stable_high_markets["recommendation_label"] = "Stable_High"
    avgs["recommendation_label"] = np.nan
    avgs.recommendation_label = avgs.recommendation_label.fillna(df_best.recommendation_label)
    avgs.recommendation_label = avgs.recommendation_label.fillna(df_high_density.recommendation_label)
    avgs.recommendation_label = avgs.recommendation_label.fillna(df_stable_high_markets.recommendation_label)
    avgs.recommendation_label = avgs.recommendation_label.fillna("Not Recommended to Enter")
    return avgs
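# A sketch of the scale -> cluster -> unscale pattern used above: KMeans runs
# on power-transformed features so both have comparable spread, then the
# features are inverted back for reporting. The data and kmeans model here
# are illustrative stand-ins.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import PowerTransformer

X = np.random.default_rng(0).lognormal(size=(100, 2))
scaler = PowerTransformer().fit(X)
X_t = scaler.transform(X)
labels = KMeans(n_clusters=3, random_state=0).fit_predict(X_t)
X_orig = scaler.inverse_transform(X_t)   # back to interpretable units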
class DFPowerTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = PowerTransformer(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        # Reference: https://help.gooddata.com/doc/en/reporting-and-dashboards/maql-analytical-query-language/maql-expression-reference/aggregation-functions/statistical-functions/predictive-statistical-use-cases/normality-testing-skewness-and-kurtosis
        # Highly skewed:           |skewness| > 1
        # Moderately skewed:       0.5 < |skewness| <= 1
        # Approximately symmetric: |skewness| <= 0.5
        skew_df = X[self.transform_cols].skew().to_frame(name='Skewness')
        # Kurtosis of a normal distribution: 3
        kurt_df = X[self.transform_cols].kurt().to_frame(name='Kurtosis')
        self.stat_df = skew_df.merge(kurt_df, left_index=True, right_index=True, how='left')
        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                f"Call 'fit' with appropriate arguments before using this estimator.")
        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(X[self.transform_cols])

        # Transformed skewness & kurtosis
        skew_df = new_X[self.transform_cols].skew().to_frame(name='Skewness (Transformed)')
        kurt_df = new_X[self.transform_cols].kurt().to_frame(name='Kurtosis (Transformed)')
        stat_df = skew_df.merge(kurt_df, left_index=True, right_index=True, how='left')
        self.stat_df = self.stat_df.merge(stat_df, left_index=True, right_index=True, how='left')
        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                f"Call 'fit' with appropriate arguments before using this estimator.")
        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(X[self.transform_cols])
        return new_X
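# Usage sketch: transform only selected DataFrame columns and inspect the
# before/after skewness and kurtosis summary collected by the wrapper.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.random.default_rng(0).lognormal(size=200),
                   'b': np.random.default_rng(1).normal(size=200)})
tf = DFPowerTransformer(columns=['a'])
df_t = tf.fit_transform(df)              # only column 'a' is transformed
print(tf.stat_df)                        # skew/kurtosis before vs after
df_back = tf.inverse_transform(df_t)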
print(cross_val_score(elastic, X_train_sc, y_train_pt[:, 0], cv=5).mean())

# model fitting and evaluation:
ridge.fit(X_train_sc, y_train_pt)
print('ridge score on training set:', ridge.score(X_train_sc, y_train_pt))
print('ridge score on test set:  ', ridge.score(X_test_sc, y_test_pt))

# predicting:
ridge_pred = ridge.predict(X_test_sc)
plt.hist(ridge_pred)
plt.title('ridge predictions, based on log-transformation'.title())

# to go back to the original scale:
# .reshape(-1, 1) turns the 1-D prediction array into a single-column 2-D
# array, which is the shape inverse_transform expects
ridge_pred_reversed = pt_y.inverse_transform(ridge_pred.reshape(-1, 1))
plt.hist(ridge_pred_reversed)
plt.title('ridge predictions, back to original values'.title())
print('ridge score on target: ', r2_score(y_test, ridge_pred_reversed))

resid = y_test_pt - ridge_pred
plt.hist(resid)
plt.title('errors distribution of ridge prediction'.title())

test_data_sc = ss.transform(test_data)
saleprice = ridge.predict(test_data_sc)
plt.hist(saleprice)  # after rescaling
plt.title('sale prices after log transformation'.title())