def stationarity_analysis(X, item_analysis_res):
    """Tabulate stationarity and normality checks for every analysed item.

    For each key of ``item_analysis_res`` the ``unit_sales`` values of the
    matching item are pulled out of ``X`` (indexed by ``date``) and passed
    through ``kpss`` (with ``regression='ct'``), ``normaltest`` and
    ``shapiro``. Each cell of the result holds a label, not a p-value.

    Parameters
    ----------
    X : pandas.DataFrame
        Needs the columns ``item_nbr``, ``date`` and ``unit_sales``.
    item_analysis_res : mapping
        Each value is a pair whose first element is the item number.

    Returns
    -------
    pandas.DataFrame
        One row per key. KPSS columns contain ``'stationary'`` when the
        p-value is above 0.05, normality columns contain ``'normal'`` when
        the p-value is at least 0.05; otherwise ``'-'``.

    Notes
    -----
    The two ``...(boxcox(resid))`` columns are computed on the Yeo-Johnson
    transform of the seasonal-decomposition residuals; the column labels are
    kept unchanged because callers may select on them.
    """
    res = pd.DataFrame(
        [],
        columns=[
            'kpss(unit_sales)',
            'kpss(boxcox(unit_sales))',
            'kpss(unit_sales.diff(1))',
            'kpss(unit_sales.diff(365))',
            'kpss(unit_sales.diff(1).diff(365))',
            'normal_test(unit_sales)',
            'shapiro_test(unit_sales)',
            'normal_test(boxcox(unit_sales))',
            'shapiro_test(boxcox(unit sales))',
            'normal_test(boxcox(resid))',
            'shapiro_test(boxcox(resid))',
        ])
    alpha = 0.05
    daily = 365

    def _stationary(series):
        # label derived from the KPSS p-value (second element of the result)
        return 'stationary' if kpss(series, regression='ct')[1] > alpha else '-'

    def _normal(test, values):
        # label derived from the p-value of the given normality test
        return 'normal' if test(values)[1] >= alpha else '-'

    for k in item_analysis_res.keys():
        item, _ = item_analysis_res[k]
        sales = X.loc[X['item_nbr'] == item,
                      ['date', 'unit_sales']].set_index('date')['unit_sales']

        # each transform is computed once and shared by the tests below
        sales_bc, _ = boxcox(sales)
        decompose_x = seasonal_decompose(sales, model='additive', freq=daily)
        resid_yj, _ = yeojohnson(decompose_x.resid.dropna())

        res.loc[k, :] = (
            _stationary(sales),
            _stationary(sales_bc),
            # differencing leaves leading NaN values; drop them before testing
            _stationary(sales.diff(1).dropna()),
            _stationary(sales.diff(daily).dropna()),
            # this column must test the doubly differenced series
            _stationary(sales.diff(1).diff(daily).dropna()),
            _normal(normaltest, sales),
            _normal(shapiro, sales),
            _normal(normaltest, sales_bc),
            _normal(shapiro, sales_bc),
            _normal(normaltest, resid_yj),
            _normal(shapiro, resid_yj),
        )
    return res
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Estimate, for each selected variable, the lambda used by the
    Yeo-Johnson transformation.

    Args:
        X: pandas dataframe of shape = [n_samples, n_features]
            Training data. It may hold more columns than the ones being
            transformed.

        y: Unused. Accepted so the signature matches other transformers.
            Defaults to None.

    Returns:
        self
    """
    # the parent class hands back the frame to work on
    X = super().fit(X)

    self.lambda_dict_ = {}
    selected = self.variables

    # cast to float before fitting; this avoids a NumPy error
    X[selected] = X[selected].astype("float")

    for name in selected:
        fitted = stats.yeojohnson(X[name])
        # second element of the result is the fitted lambda
        self.lambda_dict_[name] = fitted[1]

    self.input_shape_ = X.shape
    return self
def fit(self, X, y=None):
    """
    Learns the numerical variables and captures the optimal Yeo-Johnson
    lambda for each of them.

    The input dataframe is left untouched: the float cast needed by the
    lambda search is applied to a temporary copy of each column.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        Can be the entire dataframe, not just selected variables.

    y : None
        y is not needed in this transformer, yet the sklearn pipeline API
        requires this parameter for checking. You can either leave it as
        None or pass y.

    Returns
    -------
    self
    """
    super().fit(X, y)

    self.lambda_dict_ = {}

    for var in self.variables:
        # cast a temporary copy to float (avoids a NumPy error) instead of
        # writing the cast back into the caller's dataframe
        _, self.lambda_dict_[var] = stats.yeojohnson(X[var].astype('float'))

    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Fit one Yeo-Johnson lambda per variable to transform.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        Training samples. Columns other than the variables to transform
        may be present.

    y : None
        Ignored by this transformer. Either y or None may be passed.
    """
    # work on whatever frame the parent class returns
    X = super().fit(X)

    self.lambda_dict_ = {}
    columns = self.variables

    # float cast first, to avoid a NumPy error during the fit
    X[columns] = X[columns].astype('float')

    for column in columns:
        result = stats.yeojohnson(X[column])
        self.lambda_dict_[column] = result[1]

    self.input_shape_ = X.shape
    return self
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the fitted variables with their stored Yeo-Johnson lambdas.

    Parameters
    ----------
    X: Pandas DataFrame of shape = [n_samples, n_features]
        The data to be transformed.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame
    ValueError
        - If the variable(s) contain null values
        - If the df has different number of features than the df used in fit()

    Returns
    -------
    X: pandas dataframe
        The dataframe with the transformed variables.
    """
    # the parent class checks the input and the fitted state
    X = super().transform(X)

    for name in self.variables_:
        fitted_lambda = self.lambda_dict_[name]
        X[name] = stats.yeojohnson(X[name], lmbda=fitted_lambda)

    return X
def transform(self, X):
    """
    Applies the Yeo-Johnson transformation with the lambdas stored in
    ``lambda_dict_``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input samples.

    Raises
    ------
    ValueError
        If X has a different number of columns than the frame seen in fit.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features]
        A copy of the input with the selected variables transformed.
    """
    # both attributes are set by fit()
    check_is_fitted(self, ['input_shape_', 'lambda_dict_'])

    expected_columns = self.input_shape_[1]
    if X.shape[1] != expected_columns:
        raise ValueError('Number of columns in dataset is different from training set used to fit the transformer')

    # work on a copy so the caller's frame is not modified
    X = X.copy()
    for name in self.variables:
        fitted_lambda = self.lambda_dict_[name]
        X[name] = stats.yeojohnson(X[name], lmbda=fitted_lambda)

    return X
def gen_ts(self):
    """
    Build the synthetic series from its components and store it on the
    instance.

    Sets ``self.event``, ``self.season``, ``self.trend``, ``self.level``
    (each 0 when the component is missing), the ``*_tbreak_``,
    ``*_thetat_`` and ``*_theta_`` attributes returned by ``_gen_ts`` for
    trend and level, and ``self.ts``: the sum of the components passed
    through one Yeo-Johnson transform per drawn lambda.
    """
    seasonal_part = self.seasonal_data()
    event_part = self.event_data()
    (trend_part, self.trend_tbreak_,
     self.trend_thetat_, self.trend_theta_) = self._gen_ts(
        'trend', self.trend_cpt, self.trend_noise_level, self.trend_slow_bleed)
    (level_part, self.level_tbreak_,
     self.level_thetat_, self.level_theta_) = self._gen_ts(
        'level', self.level_cpt, self.level_noise_level, self.level_slow_bleed)

    # the sign of t_size selects how the lambdas are drawn
    if self.t_size == 0:  # additive
        lambdas = [1.0]
        label = 'TS is additive:: lambdas: '
    elif self.t_size > 0:  # mixture
        lambdas = np.random.uniform(low=-2.0, high=2.0, size=self.t_size)
        label = 'TS mixture:: lambdas: '
    else:  # multiplicative
        lambdas = np.random.uniform(low=1.5, high=3.0, size=int(np.abs(self.t_size)))
        label = 'TS is multiplicative:: lambdas: '
    print(label + str(lambdas))

    def _or_zero(component):
        # a missing component contributes nothing to the sum
        return component if component is not None else 0

    # additive components
    self.event = _or_zero(event_part)
    self.season = _or_zero(seasonal_part)
    self.trend = _or_zero(trend_part)
    self.level = _or_zero(level_part)
    self.ts = self.event + self.season + self.trend + self.level

    # apply the transforms one after the other, in draw order
    for power in lambdas:
        self.ts = sps.yeojohnson(self.ts, lmbda=power)
def stabilize_variance(df):
    """Apply a power transform chosen from the sign of the smallest value.

    Box-Cox is used when every value is strictly positive, Yeo-Johnson
    otherwise. The result of the chosen function is returned unchanged.
    """
    strictly_positive = min(df) > 0
    if not strictly_positive:
        print("Using Yeo-Johnson Power Transformation...")
        return yeojohnson(df)
    print("Using Box-Cox Power Transformation...")
    return boxcox(df)
def transform(self, X: dt.Frame):
    """Apply the fitted Yeo-Johnson transform to the first column of X.

    Values that are NaN or not above ``-self._offset`` are left as they
    are. X itself is returned when nothing can be transformed or no lambda
    has been fitted; otherwise the transformed values array is returned.
    """
    values = X.to_pandas().iloc[:, 0].values
    skipped = np.isnan(values) | np.array(values <= -self._offset)
    usable = ~skipped
    if not any(usable) or self._lmbda is None:
        return X
    # shift by the stored offset, then transform with the pre-computed lambda
    shifted = self._offset + values[usable]
    values[usable] = yeojohnson(shifted, lmbda=self._lmbda)
    return values
def fit(self, X, y=None, **fitparams):
    """Fit one Yeo-Johnson lambda per column of X and store it in ``self.lams``.

    X is first passed through ``validate_dataframe``; every column is cast
    to float before fitting. ``y`` and ``fitparams`` are not used.
    Returns self.
    """
    frame = validate_dataframe(X)
    self.lams = {}
    for name in frame.columns:
        fitted = yeojohnson(frame[name].astype(float))
        # second element of the result is the fitted lambda
        self.lams[name] = fitted[1]
    return self
def test_transformed_corr(self):
    """Check the Pearson correlation between service time and the
    Yeo-Johnson transformed absenteeism hours against reference values."""
    hours_as_float = self.data["Absenteeism time in hours"].apply(float)
    transformed_hours = yeojohnson(hours_as_float)[0]
    service_time = self.data["Service time"]

    result = pearsonr(service_time, transformed_hours)
    coefficient, p_value = result[0], result[1]

    self.assertAlmostEqual(coefficient, -0.042, places=2)
    self.assertAlmostEqual(p_value, 0.25, places=2)
def yeojohnson_transform(self):
    """Add a ``<feature>_yeo`` column to ``self.data`` for every numeric
    feature and record the new column names.

    The names are appended to ``self.transformed_numeric_features`` only
    after every feature has been transformed.
    """
    created = []
    for feature in self.numeric_features:
        target = f"{feature}_yeo"
        transformed, _ = yeojohnson(self.data[feature])
        self.data[target] = transformed
        created.append(target)
    self.transformed_numeric_features += created
def test_actual_results_power_transformer_yeo_johnson():
    """Compare PowerTransformer('yeo-johnson') with scipy's yeojohnson.

    The expected array is built from the first three entries of the
    module-level ``X`` and, when ``standardize`` is set, passed through
    ``StandardScaler().transform``.
    """
    for standardize in (True, False):
        transformer = PowerTransformer(method='yeo-johnson',
                                       standardize=standardize)
        arr_actual = transformer.fit_transform(X)

        arr_desired = []
        for i in range(3):
            transformed = yeojohnson(X[i].astype('float64'))[0]
            arr_desired.append(transformed)

        if standardize:
            arr_desired = StandardScaler().transform(arr_desired)

        np.testing.assert_allclose(arr_actual, arr_desired,
                                   atol=1e-5, rtol=0.)
def trans(self):
    """Clean ``self.dataset``, fit a Yeo-Johnson transform and scale the data.

    Steps: NaN values are replaced by 0, the date column (``Date`` or, if
    absent, ``Timestamp``) is dropped, the remaining values are described,
    standardised, flattened and plotted against the normal distribution
    before and after a Yeo-Johnson transform, and finally min-max scaled.

    Returns
    -------
    tuple
        ``(df, remove_nan, check_datatypes, drop, describe,
        centered_scaled_data, lmbda, skewness_before, skewness_after, df)``
        where ``df`` is the min-max scaled frame (it appears twice).

    Raises
    ------
    KeyError
        If the dataset has neither a ``Date`` nor a ``Timestamp`` column.
    """
    # NOTE(review): duplicated columns are not removed from the data used
    # below; confirm whether de-duplication was intended.

    # Replace missing values by zero
    remove_nan = self.dataset.replace(np.nan, 0)

    # Column dtypes after the replacement
    check_datatypes = remove_nan.dtypes

    # Find the date column: 'Date' is preferred, 'Timestamp' is the fallback
    date_column = next(
        (name for name in ('Date', 'Timestamp') if name in remove_nan.columns),
        None)
    if date_column is None:
        raise KeyError("dataset has neither a 'Date' nor a 'Timestamp' column")

    # Drop the date column
    drop = remove_nan.drop([date_column], axis=1)

    # Summary statistics of the remaining columns
    describe = drop.describe()

    # Centre and scale the data
    centered_scaled_data = preprocessing.scale(drop)

    # Flatten all values row by row into one list
    y = drop.to_numpy().ravel().tolist()

    # Probability plot before the Yeo-Johnson transformation
    fig = plt.figure()
    ax1 = fig.add_subplot(211)
    stats.probplot(y, dist=stats.norm, plot=ax1)
    ax1.set_xlabel('')
    ax1.set_title('Probplot against normal distribution')

    # Probability plot after the Yeo-Johnson transformation (separate figure)
    fig = plt.figure()
    ax2 = fig.add_subplot(212)
    xt, lmbda = stats.yeojohnson(y)
    stats.probplot(xt, dist=stats.norm, plot=ax2)
    ax2.set_title('Probplot after Yeo-Johnson transformation')

    # Skewness before and after the transformation
    skewness_before = skew(drop)
    skewness_after = skew(xt)

    # Standardization
    # NOTE(review): scaled_df is computed but not part of the returned tuple.
    names = drop.columns
    scaler = preprocessing.StandardScaler()
    scaled_df = scaler.fit_transform(drop)
    scaled_df = pd.DataFrame(scaled_df, columns=names)

    # Normalization
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(drop.values)
    df = pd.DataFrame(x_scaled)

    return (df, remove_nan, check_datatypes, drop, describe,
            centered_scaled_data, lmbda, skewness_before, skewness_after, df)
def transform(self, X, **transformparams):
    """Return the Yeo-Johnson transformed columns of X under new names.

    Each column is cast to float, transformed with the lambda stored in
    ``self.lams`` and written to ``prefix + column + suffix`` on a copy of
    the validated frame. Only the new columns are returned.
    ``transformparams`` is not used.
    """
    frame = validate_dataframe(X).copy()
    produced = []
    for source in frame.columns:
        target = self.prefix + source + self.suffix
        produced.append(target)
        as_float = frame[source].astype(float)
        frame[target] = yeojohnson(as_float, lmbda=self.lams[source])
    return frame.loc[:, produced]
def fit_transform(self, X: dt.Frame, y: np.array = None):
    """Fit the offset and the Yeo-Johnson lambda on the first column of X,
    then delegate to ``transform``.

    The offset is the negated minimum when the minimum is negative (0
    otherwise) plus 1e-3. X is returned unchanged when every value is NaN.
    ``y`` is not used.
    """
    values = X.to_pandas().iloc[:, 0].values
    missing = np.isnan(values)

    smallest = np.nanmin(values)
    self._offset = (-smallest if smallest < 0 else 0) + 1e-3

    self._lmbda = None
    if not any(~missing):
        return X

    # with lmbda=None the call fits lambda and returns it as second element
    shifted = self._offset + values[~missing]
    self._lmbda = yeojohnson(shifted, lmbda=None)[1]
    return self.transform(X)
def _make_normal(self):
    """Normalize the stored data using Yeo-Johnson and map the limits.

    For a flat list (``lofl == False``) the data is transformed only when
    ``normal_test`` reports it as not normal; if it is already normal the
    method returns without touching anything. For a list of lists
    (``lofl == True``) the flattened data is transformed and split back
    into chunks of ``self.n``.

    ``lsl``, ``usl`` and ``target`` are then transformed with the fitted
    lambda and rounded to 3 decimals. A limit that is ``None`` stays
    ``None``; ``midpoint`` is ``None`` unless both limits are available.
    """
    # `== False` / `== True` are kept deliberately: a value that is neither
    # skips both branches, exactly as before.
    if self.lofl == False:  # noqa: E712
        is_normal = self.normal_test(self.lst, self.alpha)
        if is_normal == False:  # noqa: E712
            lst, self.lmbda = stats.yeojohnson(self.lst)
            self.lst = lst.tolist()
        else:
            return
    if self.lofl == True:  # noqa: E712
        lst, self.lmbda = stats.yeojohnson(self.Make_Long(self.lst))
        self.lst = Capability.Brake_Down(lst=lst.tolist(), n=self.n)

    # Transform whichever limits exist; missing ones remain None instead of
    # leaving a local variable unbound.
    lsl = usl = None
    if self.lsl is not None:
        lsl = round(stats.yeojohnson(self.lsl, self.lmbda).tolist(), 3)
    if self.usl is not None:
        usl = round(stats.yeojohnson(self.usl, self.lmbda).tolist(), 3)

    if lsl is not None and usl is not None:
        # keep the lower limit below the upper one after the transform
        self.lsl, self.usl = min(lsl, usl), max(lsl, usl)
    else:
        self.lsl, self.usl = lsl, usl

    if self.target is not None:
        self.target = round(
            stats.yeojohnson(self.target, self.lmbda).tolist(), 3)

    if self.lsl is None or self.usl is None:
        self.midpoint = None
    else:
        self.midpoint = round((self.usl + self.lsl) / 2, 3)
def yeoJohnson(obs: 'pd.Series') -> 'Tuple[np.ndarray, float]':
    '''
    Apply a Yeo-Johnson transformation to the data. Unlike Box-Cox it
    accepts zero and negative values.

    Input:
        :param obs: sequential data to analyze
    Output:
        obs_yj: Data transformed
        lmbd: Optimal lambda
    '''
    obs_yj, lmbd = yeojohnson(obs)
    return (obs_yj, lmbd)
def transform(self, X: dt.Frame):
    """Apply the fitted Yeo-Johnson transform to the first column of X.

    Values that are NaN or not above ``-self._offset`` are left unchanged.
    X itself is returned when nothing can be transformed or no lambda has
    been fitted; otherwise a new frame is returned in which infinite
    values have been replaced by missing values.
    """
    XX = X.to_pandas().iloc[:, 0].values
    is_na = np.isnan(XX) | np.array(XX <= -self._offset)
    if not any(~is_na) or self._lmbda is None:
        return X
    # apply transform with pre-computed lambda
    XX[~is_na] = yeojohnson(self._offset + XX[~is_na], lmbda=self._lmbda)
    XX = dt.Frame(XX)
    # Don't leave inf/-inf. Frame.replace acts on the whole frame in place,
    # so a single call is enough.
    XX.replace([math.inf, -math.inf], None)
    return XX
def normal_tests(data, alpha=0.05):
    """Compare skewness, kurtosis and normality of ``data`` across transforms.

    The raw data and its square root, cube root, logarithm and Yeo-Johnson
    transform each get one row. Columns hold the skewness and kurtosis
    values plus the results of ``skewness_test``, ``kurtosis_test`` and
    ``dagostino_test`` called with ``alpha``.
    """
    yj = stats.yeojohnson(data)[0]
    variants = {
        'default': data,
        'sqrt': np.sqrt(data),
        'cuberoot': np.cbrt(data),
        'log': np.log(data),
        'yeojohnson': yj,
    }
    series = list(variants.values())

    report = {
        'Skewness': [stats.skew(values) for values in series],
        'Skewness Test': [skewness_test(values, alpha=alpha) for values in series],
        'Kurtosis': [stats.kurtosis(values) for values in series],
        'Kurtosis Test': [kurtosis_test(values, alpha=alpha) for values in series],
        'Normal Test': [dagostino_test(values, alpha=alpha) for values in series],
    }
    return pd.DataFrame(report, index=list(variants))
def transform_variable(var, transform_choice):
    """Transform ``var`` with the method selected by ``transform_choice``.

    Parameters
    ----------
    var : array-like (numpy array or pandas Series)
        Values to transform.
    transform_choice : int
        1 - logarithm. Data with negative values is first shifted by
            ``abs(min) + 1``; data containing a zero uses ``log1p``.
        2 - square root. Data with negative values is first shifted by
            ``abs(min)``.
        3 - Yeo-Johnson with the lambda fitted on ``var``.

    Returns
    -------
    The transformed values.

    Raises
    ------
    ValueError
        If ``transform_choice`` is not 1, 2 or 3.
    """
    # Decide on the values themselves: ``0 in series`` would look at the
    # index labels of a pandas Series, not at its data.
    values = np.asarray(var)

    if transform_choice == 1:
        if np.any(values < 0):
            return np.log(var + (np.abs(np.min(var)) + 1))
        if np.any(values == 0):
            return np.log1p(var)
        return np.log(var)

    if transform_choice == 2:
        if np.any(values < 0):
            return np.sqrt(var + np.abs(np.min(var)))
        return np.sqrt(var)

    if transform_choice == 3:
        transformed, _ = stats.yeojohnson(var)
        return transformed

    raise ValueError('Transform option not supported')
def normalize(self, df, method="boxcox"):
    """
    Transform every column of the given dataframe in place and return it.

    Inputs:
        - df: dataframe with numeric columns
        - method: "boxcox", "log" or "yeojohnson"; any other value leaves
          the dataframe untouched
    Returns the same dataframe object with its columns replaced
    """
    def _boxcox(column):
        return stats.boxcox(column)[0]

    def _yeojohnson(column):
        return stats.yeojohnson(column)[0]

    if method == "boxcox":
        convert = _boxcox
    elif method == "log":
        convert = np.log
    elif method == "yeojohnson":
        convert = _yeojohnson
    else:
        return df

    for name in df.columns:
        df[name] = convert(df[name])
    return df
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the selected variables with their stored Yeo-Johnson lambdas.

    Args:
        X: Pandas DataFrame of shape = [n_samples, n_features]
            The data to be transformed.

    Returns:
        The dataframe with the transformed variables.
    """
    # the parent class checks the input and the fitted state
    X = super().transform(X)

    for name in self.variables:
        fitted_lambda = self.lambda_dict_[name]
        X[name] = stats.yeojohnson(X[name], lmbda=fitted_lambda)

    return X
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Fit one Yeo-Johnson lambda per variable and record the number of
    input features.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        Training samples. Columns other than the variables to transform
        may be present.

    y: pandas Series, default=None
        Ignored by this transformer. Either y or None may be passed.

    Raises
    ------
    TypeError
        - If the input is not a Pandas DataFrame
        - If any of the user provided variables are not numerical
    ValueError
        - If there are no numerical variables in the df or the df is empty
        - If the variable(s) contain null values

    Returns
    -------
    self
    """
    # work on whatever frame the parent class returns
    X = super().fit(X)

    self.lambda_dict_ = {}
    columns = self.variables_

    # float cast first, to avoid a NumPy error during the fit
    X[columns] = X[columns].astype("float")

    for column in columns:
        result = stats.yeojohnson(X[column])
        # second element of the result is the fitted lambda
        self.lambda_dict_[column] = result[1]

    self.n_features_in_ = X.shape[1]
    return self
def transform(self, X):
    """Add ``*_YeoJohnson`` and ``*_BoxCox`` columns to X and return it.

    Yeo-Johnson columns are written when ``self.yeojohnson`` is set,
    Box-Cox columns when ``self.boxcox`` is set. Box-Cox is applied in
    three passes: columns shifted by 1, columns shifted by
    ``abs(min) + 1``, and columns used as they are. A ``ValueError``
    during a Box-Cox step fills the target column with 0. X is modified
    in place.
    """
    if self.yeojohnson:
        # apply learned Yeo-Johnson transformation
        for col in self.yj_lambdas_dict_.keys():
            fitted_lambda = self.yj_lambdas_dict_[col]
            X[col + "_YeoJohnson"] = stats.yeojohnson(X[col].values,
                                                      lmbda=fitted_lambda)

    if self.boxcox:
        # (attribute holding the lambdas, shift applied before Box-Cox)
        passes = (
            ("bc_zero_lambdas_dict_", lambda frame, col: frame[col].values + 1),
            ("bc_neg_lambdas_dict_",
             lambda frame, col: frame[col].values
             + np.abs(np.min(frame[col].values)) + 1),
            ("bc_lambdas_dict_", lambda frame, col: frame[col].values),
        )
        for attribute, shifted_values in passes:
            lambdas = getattr(self, attribute)
            for col in lambdas.keys():
                target = col + "_BoxCox"
                try:
                    X[target] = stats.boxcox(shifted_values(X, col),
                                             lmbda=lambdas[col])
                except ValueError:
                    X[target] = 0.

    return X
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Fit one Yeo-Johnson lambda per variable to transform.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        Training samples. Columns other than the variables to transform
        may be present.

    y: pandas Series, default=None
        Ignored by this transformer. Either y or None may be passed.
    """
    # work on whatever frame the parent class returns
    X = super()._fit_from_varlist(X)

    self.lambda_dict_ = {}
    for column in self.variables_:
        result = stats.yeojohnson(X[column])
        # second element of the result is the fitted lambda
        self.lambda_dict_[column] = result[1]

    return self
def transform_norm(data, type_transform="yeojohnson"):
    """Power-transform ``data`` and fit a normal distribution to the result.

    Parameters
    ----------
    data : sequence of numbers
        Values to transform.
    type_transform : {"yeojohnson", "box-cox"}
        Transformation to apply. For "box-cox" the data is shifted first
        when its minimum is not strictly positive.

    Returns
    -------
    tuple
        ``(new_data, mu, std)``: the transformed values and the mean and
        standard deviation returned by ``norm.fit``.

    Raises
    ------
    ValueError
        If ``type_transform`` is not one of the supported names.
    """
    if type_transform == "box-cox":
        minimum = min(data)
        print("Minimum value:", minimum)
        shift = 0
        if minimum <= 0:
            # Box-Cox needs strictly positive input. Rounding |min| alone can
            # leave the smallest value at or below zero, so add one when needed.
            shift = round(abs(minimum))
            if minimum + shift <= 0:
                shift += 1
        posdata = [x + shift for x in data]
        new_data, _ = boxcox(posdata)
    elif type_transform == "yeojohnson":
        new_data, _ = yeojohnson(data)
    else:
        raise ValueError("No such transformation")

    mu, std = norm.fit(new_data)
    return new_data, mu, std
# Example script: plot the Yeo-Johnson log-likelihood over a range of lambdas
# and mark the value obtained for the lambda that `stats.yeojohnson` fits.
# NOTE(review): `np` and `stats` are used below but not imported in this
# snippet; they are presumably imported elsewhere - confirm.
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

# Fix the seed so the random sample below is reproducible.
np.random.seed(1245)

# Generate some random variates and calculate Yeo-Johnson log-likelihood
# values for them for a range of ``lmbda`` values:
x = stats.loggamma.rvs(5, loc=10, size=1000)
lmbdas = np.linspace(-2, 10)
llf = np.zeros(lmbdas.shape, dtype=float)
for ii, lmbda in enumerate(lmbdas):
    llf[ii] = stats.yeojohnson_llf(lmbda, x)

# Also find the optimal lmbda value with `yeojohnson`:
x_most_normal, lmbda_optimal = stats.yeojohnson(x)

# Plot the log-likelihood as a function of lmbda. Add the log-likelihood at
# the optimal lmbda as a horizontal line to check that it really is the
# optimum:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(lmbdas, llf, 'b.-')
ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
ax.set_xlabel('lmbda parameter')
ax.set_ylabel('Yeo-Johnson log-likelihood')

# Now add some probability plots to show that where the log-likelihood is
# maximized the data transformed with `yeojohnson` looks closest to normal:
locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
from scipy import stats
import matplotlib.pyplot as plt

# Draw a non-normal sample and show its Yeo-Johnson normality plot for
# lambdas between -20 and 20:
x = stats.loggamma.rvs(5, size=500) + 5

fig, ax = plt.subplots()
prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)

# Fit the lambda with `yeojohnson` and mark it on the same axes with a
# vertical red line:
_, maxlog = stats.yeojohnson(x)
ax.axvline(maxlog, color='r')

plt.show()
def eda_numerical_variable(self, variable):
    '''
    Print and plot an exploratory analysis of one numerical variable.

    Parameter:
    variable: name of the column of the stored dataframe to analyse

    Shows basic statistics, missing values, the distribution with spread
    statistics, a Q-Q plot, a box plot, outlier counts based on the IQR
    and the normality diagnostics of several variable transformations.
    Quantiles ignore missing values. The Box-Cox step is skipped with a
    message when scipy rejects the data, so the Yeo-Johnson step still runs.
    '''
    c = variable
    s = self.__df__[variable]

    # 1. Basic Statistics
    print ('Total Number of observations : ', len(s))
    print ()

    print ('Datatype :', (s.dtype))
    print ()

    printmd ('**<u>5 Point Summary :</u>**')

    print (' Minimum :\t\t', s.min(), '\n 25th Percentile :\t', s.quantile(0.25),
           '\n Median :\t\t', s.median(), '\n 75th Percentile :\t', s.quantile(0.75),
           '\n Maximum :\t\t', s.max())

    print ()

    # 2. Missing values
    printmd ('**<u>Missing Values :</u>**')

    print (' Number :', s.isnull().sum())
    print (' Percentage :', s.isnull().mean()*100, '%')

    # 3. Histogram
    printmd ('**<u>Variable distribution and Spread statistics :</u>**')

    sns.distplot(s.dropna(), hist = True, fit = norm, kde = True)
    plt.show()

    # 4. Spread Statistics
    print ('Skewness :' , s.skew())
    print ('Kurtosis :', s.kurt())
    print ()

    # 5. Q-Q plot
    printmd ('**<u>Normality Check :</u>**')
    res = stats.probplot(s.dropna(), dist = 'norm', plot = plt)
    plt.show()

    # 6. Box plot to check the spread outliers
    print ()
    printmd ('**<u>Box Plot and Visual check for Outlier :</u>**')
    sns.boxplot(s.dropna(), orient = 'v')
    plt.show()

    # 7. Get outliers. Here distance could be a user defined parameter which defaults to 1.5
    print ()
    printmd ('**<u>Outliers (using IQR):</u>**')

    # Series.quantile skips missing values; np.quantile would return NaN
    # and report zero outliers for any variable with nulls.
    first_quartile = s.quantile(.25)
    third_quartile = s.quantile(.75)
    IQR = third_quartile - first_quartile
    upper_boundary = third_quartile + 1.5 * IQR
    lower_boundary = first_quartile - 1.5 * IQR

    print (' Right end outliers :', np.sum(s>upper_boundary))
    print (' Left end outliers :', np.sum(s < lower_boundary))

    # 8. Various Variable Transformations
    print ()
    printmd (f'**<u>Explore various transformations for {c}</u>**')
    print ()

    print ('1. Logarithmic Transformation')
    s_log = np.log(s)
    normality_diagnostic(s_log)

    print ('2. Exponential Transformation')
    s_exp = np.exp(s)
    normality_diagnostic(s_exp)

    print ('3. Square Transformation')
    s_sqr = np.square(s)
    normality_diagnostic(s_sqr)

    print ('4. Square-root Transformation')
    s_sqrt = np.sqrt(s)
    normality_diagnostic(s_sqrt)

    print ('5. Box-Cox Transformation')
    try:
        s_boxcox, lambda_param = stats.boxcox(s)
    except ValueError as err:
        # scipy rejects e.g. non-positive or constant data; keep going so
        # the Yeo-Johnson step below is still shown
        print ('Box-Cox transformation skipped :', err)
    else:
        normality_diagnostic(s_boxcox)
        print ('Optimal Lambda for Box-Cox transformation is :', lambda_param )
    print ()

    print ('6. Yeo Johnson Transformation')
    s = s.astype('float')
    s_yeojohnson, lambda_param = stats.yeojohnson(s)
    normality_diagnostic(s_yeojohnson)
    print ('Optimal Lambda for Yeo Johnson transformation is :', lambda_param )
    print ()