def stationarity_analysis(X, item_analysis_res):
    """Tabulate KPSS stationarity and normality verdicts for each item.

    For every entry of ``item_analysis_res`` the matching ``unit_sales``
    series is pulled from ``X`` (indexed by ``date``) and tested at a
    significance level of 0.05.

    Parameters
    ----------
    X : pandas.DataFrame
        Must provide the columns ``item_nbr``, ``date`` and ``unit_sales``.
    item_analysis_res : mapping
        Each value is a 2-element sequence whose first element is the item
        number; the second element is ignored here.

    Returns
    -------
    pandas.DataFrame
        One row per key of ``item_analysis_res``. KPSS columns hold
        ``'stationary'`` or ``'-'``; normality columns hold ``'normal'``
        or ``'-'``.
    """
    res = pd.DataFrame(
        [],
        columns=[
            'kpss(unit_sales)', 'kpss(boxcox(unit_sales))',
            'kpss(unit_sales.diff(1))', 'kpss(unit_sales.diff(365))',
            'kpss(unit_sales.diff(1).diff(365))', 'normal_test(unit_sales)',
            'shapiro_test(unit_sales)', 'normal_test(boxcox(unit_sales))',
            # NOTE: "unit sales" (with a space) is kept as-is because
            # callers may select this column by its exact label.
            'shapiro_test(boxcox(unit sales))', 'normal_test(boxcox(resid))',
            'shapiro_test(boxcox(resid))'
        ])

    alpha = 0.05
    daily = 365

    def kpss_label(series):
        # KPSS null hypothesis is stationarity: a large p-value keeps it.
        p_value = kpss(series, regression='ct')[1]
        return 'stationary' if p_value > alpha else '-'

    def normal_label(test, values):
        # Both normality tests return the p-value as their second element.
        return 'normal' if test(values)[1] >= alpha else '-'

    for k in item_analysis_res.keys():
        item, _ = item_analysis_res[k]

        x = X.loc[X['item_nbr'] == item, ['date', 'unit_sales']]
        x = x.set_index('date')
        sales = x['unit_sales']

        # Computed once and reused by the KPSS and both normality checks.
        sales_bc, _ = boxcox(sales)

        # NOTE(review): newer statsmodels releases name this argument
        # ``period`` instead of ``freq`` - confirm against the pinned version.
        decompose_x = seasonal_decompose(sales,
                                         model='additive',
                                         freq=daily)
        resid_yj, _ = yeojohnson(decompose_x.resid.dropna())

        # Differencing leaves leading NaNs; drop them so the tests run on
        # real observations only.
        diff1 = sales.diff(1).dropna()
        diff365 = sales.diff(daily).dropna()
        # First difference followed by the seasonal difference, as the
        # column label states.
        diff1_365 = sales.diff(1).diff(daily).dropna()

        res.loc[k, :] = (kpss_label(sales), kpss_label(sales_bc),
                         kpss_label(diff1), kpss_label(diff365),
                         kpss_label(diff1_365),
                         normal_label(normaltest, sales),
                         normal_label(shapiro, sales),
                         normal_label(normaltest, sales_bc),
                         normal_label(shapiro, sales_bc),
                         normal_label(normaltest, resid_yj),
                         normal_label(shapiro, resid_yj))

    return res
Esempio n. 2
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Estimate one Yeo-Johnson lambda for each variable to transform.

        Args:
            X: pandas dataframe of shape = [n_samples, n_features]
            The training data. It may hold more columns than the ones
            listed in ``self.variables``.

            y: Unused by this transformer. Defaults to None.
            A pandas Series is accepted as well.

        Returns:
            self
        """

        # input checking is delegated to the parent class
        X = super().fit(X)

        fitted_lambdas = {}
        self.lambda_dict_ = fitted_lambdas

        # cast to float to avoid NumPy error
        selected = self.variables
        X[selected] = X[selected].astype("float")

        for name in selected:
            # yeojohnson returns (transformed data, lambda); keep the lambda
            fitted_lambdas[name] = stats.yeojohnson(X[name])[1]

        self.input_shape_ = X.shape

        return self
    def fit(self, X, y=None):
        """
        Learns the numerical variables. Captures the optimal lambda for
        the transformation.

        The input dataframe is left untouched: the float cast needed by the
        optimiser is applied to a temporary copy of each column.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just selected variables.
        y : None
            y is not needed in this transformer, yet the sklearn pipeline API
            requires this parameter for checking. You can either leave it as None
            or pass y.

        Returns
        -------

        self
        """

        super().fit(X, y)

        self.lambda_dict_ = {}

        for var in self.variables:
            # Cast to float to avoid a NumPy error. The cast is not written
            # back into X, so fitting does not modify the caller's data.
            _, self.lambda_dict_[var] = stats.yeojohnson(
                X[var].astype('float'))

        self.input_shape_ = X.shape

        return self
Esempio n. 4
0
    def fit(self, X, y=None):
        """
        Estimate one Yeo-Johnson lambda for each variable to transform.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training data. It may hold more columns than the ones
            that get transformed.
        y : None
            Unused by this transformer; either y or None may be passed.
        """
        # input checking is delegated to the parent class
        X = super().fit(X)

        fitted_lambdas = {}
        self.lambda_dict_ = fitted_lambdas

        # cast to float to avoid NumPy error
        selected = self.variables
        X[selected] = X[selected].astype('float')

        for name in selected:
            # yeojohnson returns (transformed data, lambda); keep the lambda
            fitted_lambdas[name] = stats.yeojohnson(X[name])[1]

        self.input_shape_ = X.shape

        return self
Esempio n. 5
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the selected variables with their fitted Yeo-Johnson lambda.

        Parameters
        ----------
        X: Pandas DataFrame of shape = [n_samples, n_features]
            The data to be transformed.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            - If the variable(s) contain null values
            - If the df has different number of features than the df used in fit()

        Returns
        -------
        X: pandas dataframe
            The dataframe with the transformed variables.
        """

        # the parent class checks the input and that fit() was called
        X = super().transform(X)

        lambdas = self.lambda_dict_
        for name in self.variables_:
            X[name] = stats.yeojohnson(X[name], lmbda=lambdas[name])

        return X
    def transform(self, X):
        """
        Applies the learned Yeo-Johnson transformation.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.

        Raises
        ------

        ValueError
            If X has a different number of columns than the data seen in fit.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            A copy of the input with the selected variables transformed.
        """

        # Make sure fit has been called
        check_is_fitted(self, ['input_shape_', 'lambda_dict_'])

        expected_columns = self.input_shape_[1]
        if X.shape[1] != expected_columns:
            raise ValueError(
                'Number of columns in dataset is different from training set '
                'used to fit the transformer')

        # work on a copy so the caller's dataframe is not modified
        transformed = X.copy()
        for name in self.variables:
            transformed[name] = stats.yeojohnson(
                transformed[name], lmbda=self.lambda_dict_[name])

        return transformed
Esempio n. 7
0
    def gen_ts(self):
        """
        Build the synthetic series and store it, with its parts, on self.

        Sets ``event``, ``season``, ``trend``, ``level`` and ``ts``, plus the
        change times and model parameters returned by ``_gen_ts`` for the
        trend and the level. Nothing is returned.
        """
        season_ = self.seasonal_data()
        event_ = self.event_data()

        trend_parts = self._gen_ts('trend', self.trend_cpt,
                                   self.trend_noise_level,
                                   self.trend_slow_bleed)
        (trend_, self.trend_tbreak_, self.trend_thetat_,
         self.trend_theta_) = trend_parts

        level_parts = self._gen_ts('level', self.level_cpt,
                                   self.level_noise_level,
                                   self.level_slow_bleed)
        (level_, self.level_tbreak_, self.level_thetat_,
         self.level_theta_) = level_parts

        # the sign of t_size selects how the Yeo-Johnson lambdas are drawn
        if self.t_size == 0:  # additive
            lambdas = [1.0]
            print('TS is additive:: lambdas: ' + str(lambdas))
        elif self.t_size > 0:  # mixture
            lambdas = np.random.uniform(low=-2.0, high=2.0, size=self.t_size)
            print('TS mixture:: lambdas: ' + str(lambdas))
        else:  # multiplicative
            lambdas = np.random.uniform(low=1.5,
                                        high=3.0,
                                        size=int(np.abs(self.t_size)))
            print('TS is multiplicative:: lambdas: ' + str(lambdas))

        # additive components; a missing component contributes 0
        parts = (('event', event_), ('season', season_), ('trend', trend_),
                 ('level', level_))
        for attr, value in parts:
            setattr(self, attr, 0 if value is None else value)

        series = self.event + self.season + self.trend + self.level
        self.ts = series
        # apply the Yeo-Johnson transforms one after another
        for lmbda in lambdas:
            self.ts = sps.yeojohnson(self.ts, lmbda=lmbda)
Esempio n. 8
0
def stabilize_variance(df):
    """Apply a power transform chosen from the sign of the smallest value.

    Box-Cox is used when every value is strictly positive, Yeo-Johnson
    otherwise. The result of the chosen scipy function is returned as-is.
    """
    strictly_positive = min(df) > 0
    if not strictly_positive:
        print("Using Yeo-Johnson Power Transformation...")
        return yeojohnson(df)

    print("Using Box-Cox Power Transformation...")
    return boxcox(df)
Esempio n. 9
0
 def transform(self, X: dt.Frame):
     """Apply the stored Yeo-Johnson lambda to the first column of X.

     Entries that are NaN or not above ``-self._offset`` are left unchanged.
     X itself is returned when no entry qualifies or no lambda is stored;
     otherwise the column's value array is returned.
     """
     values = X.to_pandas().iloc[:, 0].values
     skipped = np.isnan(values) | np.array(values <= -self._offset)
     usable = ~skipped
     if not any(usable) or self._lmbda is None:
         return X
     # apply transform with pre-computed lambda
     values[usable] = yeojohnson(self._offset + values[usable],
                                 lmbda=self._lmbda)
     return values
 def fit(self, X, y=None, **fitparams):
     """Store one Yeo-Johnson lambda per column of X in ``self.lams``.

     X is first passed through ``validate_dataframe``; each column is cast
     to float before the lambda is estimated. Returns self.
     """
     frame = validate_dataframe(X)
     self.lams = {}
     for name in frame.columns:
         # yeojohnson returns (transformed data, lambda); keep the lambda
         self.lams[name] = yeojohnson(frame[name].astype(float))[1]
     return self
    def test_transformed_corr(self):
        """Check the Pearson correlation of service time against the
        Yeo-Johnson-transformed absenteeism time."""
        hours_as_float = self.data["Absenteeism time in hours"].apply(float)
        transformed_hours, _ = yeojohnson(hours_as_float)

        result = pearsonr(self.data["Service time"], transformed_hours)

        # element 0 is the coefficient, element 1 the p-value
        self.assertAlmostEqual(result[0], -0.042, places=2)
        self.assertAlmostEqual(result[1], 0.25, places=2)
Esempio n. 12
0
    def yeojohnson_transform(self):
        """Add a ``<feature>_yeo`` column to ``self.data`` for each numeric
        feature and register the new names in
        ``self.transformed_numeric_features``."""
        created = []
        for feature in self.numeric_features:
            target = f"{feature}_yeo"
            # element 0 of the result is the transformed data
            self.data[target] = yeojohnson(self.data[feature])[0]
            created.append(target)

        # registered only after every column has been transformed
        self.transformed_numeric_features += created
Esempio n. 13
0
def test_actual_results_power_transformer_yeo_johnson():
    """Compare PowerTransformer's Yeo-Johnson output with scipy's, both with
    and without standardization."""
    for standardize in (True, False):
        transformer = PowerTransformer(method='yeo-johnson',
                                       standardize=standardize)
        arr_actual = transformer.fit_transform(X)

        # reference: scipy applied to each of the first three rows of X
        arr_desired = []
        for i in range(3):
            row_transformed, _ = yeojohnson(X[i].astype('float64'))
            arr_desired.append(row_transformed)

        if standardize:
            arr_desired = StandardScaler().transform(arr_desired)

        np.testing.assert_allclose(arr_actual, arr_desired,
                                   atol=1e-5, rtol=0.)
Esempio n. 14
0
 def trans(self):
     """Clean, describe, transform and rescale ``self.dataset``.

     Steps: NaNs are replaced by 0, the date column (``'Date'`` if present,
     otherwise ``'Timestamp'``) is dropped, the remaining values are
     flattened and Yeo-Johnson transformed, and normal probability plots
     are drawn before and after the transformation.

     Returns
     -------
     tuple
         ``(df, remove_nan, check_datatypes, drop, describe,
         centered_scaled_data, lmbda, skewness_before, skewness_after, df)``
         where ``df`` is the min-max scaled data (it appears twice, as in
         the original interface).

     Raises
     ------
     KeyError
         If the dataset has neither a ``'Date'`` nor a ``'Timestamp'``
         column.
     """
     # NOTE(review): the original also built a frame with duplicated column
     # names removed but never used it; duplicates are therefore still not
     # removed here - confirm whether they should be.

     # Replace NaN values with 0
     remove_nan = self.dataset.replace(np.nan, 0)
     # Data types of every column
     check_datatypes = remove_nan.dtypes

     # Locate the date column. ``'Date' or 'Timestamp' in columns`` is always
     # truthy and ``"Date" or "Timestamp"`` is always "Date", so each
     # candidate is tested explicitly.
     date_column = next(
         (name for name in ("Date", "Timestamp")
          if name in remove_nan.columns),
         None)
     if date_column is None:
         raise KeyError(
             "dataset has neither a 'Date' nor a 'Timestamp' column")

     # Drop the date column
     drop = remove_nan.drop([date_column], axis=1)
     # Summary statistics of the remaining columns
     describe = drop.describe()
     # Centre and scale the dataset
     centered_scaled_data = preprocessing.scale(drop)

     # Flatten all values row by row into one list (linear time, unlike
     # repeatedly concatenating lists with sum(..., [])).
     y = drop.to_numpy().ravel().tolist()

     # Plot before the Yeo-Johnson transformation
     fig = plt.figure()
     ax1 = fig.add_subplot(211)
     stats.probplot(y, dist=stats.norm, plot=ax1)
     ax1.set_xlabel('')
     ax1.set_title('Probplot against normal distribution')

     # Plot after the Yeo-Johnson transformation (drawn on a second figure,
     # as before)
     fig = plt.figure()
     ax2 = fig.add_subplot(212)
     xt, lmbda = stats.yeojohnson(y)
     stats.probplot(xt, dist=stats.norm, plot=ax2)
     ax2.set_title('Probplot after Yeo-Johnson transformation')

     # Skewness before and after the transformation
     skewness_before = skew(drop)
     skewness_after = skew(xt)

     # Min-max normalization
     min_max_scaler = preprocessing.MinMaxScaler()
     x_scaled = min_max_scaler.fit_transform(drop.values)
     df = pd.DataFrame(x_scaled)

     return (df, remove_nan, check_datatypes, drop, describe,
             centered_scaled_data, lmbda, skewness_before, skewness_after,
             df)
 def transform(self, X, **transformparams):
     """Return the Yeo-Johnson-transformed columns under new names.

     Each column of the validated input is cast to float, transformed with
     the lambda stored for it in ``self.lams`` and written to a column named
     ``self.prefix + column + self.suffix``. Only the new columns are
     returned.
     """
     frame = validate_dataframe(X).copy()
     new_col_list = []
     for source in frame.columns:
         target = self.prefix + source + self.suffix
         new_col_list.append(target)
         as_float = frame[source].astype(float)
         frame[target] = yeojohnson(as_float, lmbda=self.lams[source])
     return frame.loc[:, new_col_list]
Esempio n. 16
0
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     """Fit the offset and Yeo-Johnson lambda on the first column of X,
     then delegate to ``transform``.

     X is returned unchanged when the column holds no non-NaN value.
     """
     XX = X.to_pandas().iloc[:, 0].values
     is_na = np.isnan(XX)

     # shift so the smallest observed value ends up slightly above zero
     lowest = np.nanmin(XX)
     self._offset = -lowest if lowest < 0 else 0
     self._offset += 1e-3

     self._lmbda = None
     has_data = any(~is_na)
     if not has_data:
         return X

     # compute lambda: with lmbda=None scipy returns (data, fitted lambda)
     _, self._lmbda = yeojohnson(self._offset + XX[~is_na], lmbda=None)
     return self.transform(X)
Esempio n. 17
0
    def _make_normal(self):
        """Normalize a list using Yeo-Johnson."""
        if self.lofl == False:
            is_normal = self.normal_test(self.lst, self.alpha)
            if is_normal == False:
                lst, self.lmbda = stats.yeojohnson(self.lst)
                self.lst = lst.tolist()
            else:
                return
        if self.lofl == True:
            lst, self.lmbda = stats.yeojohnson(self.Make_Long(self.lst))
            lst = lst.tolist()
            self.lst = Capability.Brake_Down(lst=lst, n=self.n)
            # normalised_lofl = []
            # for sublist in self.lst:
            #     sublist, _ = stats.yeojohnson(sublist)
            #     normalised_lofl.append(list(sublist))
            # self.lst = normalised_lofl

        if self.lsl is not None:
            lsl = stats.yeojohnson(self.lsl, self.lmbda).tolist()
            lsl = round(lsl, 3)
        if self.usl is not None:
            usl = stats.yeojohnson(self.usl, self.lmbda).tolist()
            usl = round(usl, 3)
        self.lsl = min(lsl, usl)
        self.usl = max(lsl, usl)
        if self.target is not None:
            self.target = stats.yeojohnson(self.target, self.lmbda).tolist()
            self.target = round(self.target, 3)
        try:
            (self.usl - self.lsl) / 2
        except ValueError:
            self.midpoint = None
            pass
        else:
            self.midpoint = (self.usl + self.lsl) / 2
            self.midpoint = round(self.midpoint, 3)
Esempio n. 18
0
    def yeoJohnson(obs: 'pd.Series') -> 'pd.Series':
        '''
           Apply a Yeo-Johnson transformation to the data; the transform
           also accepts negative values.

           Input:
           :param obs: sequential data to analyze

           Output:
           A 2-tuple of the transformed data and the optimal lambda.
        '''
        transformed, optimal_lambda = yeojohnson(obs)
        return transformed, optimal_lambda
 def transform(self, X: dt.Frame):
     """Apply the stored Yeo-Johnson lambda to the first column of X.

     Entries that are NaN or not above ``-self._offset`` are left unchanged.
     X itself is returned when no entry qualifies or no lambda is stored;
     otherwise a new frame built from the column's values is returned.
     """
     values = X.to_pandas().iloc[:, 0].values
     skipped = np.isnan(values) | np.array(values <= -self._offset)
     usable = ~skipped
     if not any(usable) or self._lmbda is None:
         return X
     # apply transform with pre-computed lambda
     values[usable] = yeojohnson(self._offset + values[usable],
                                 lmbda=self._lmbda)
     result = dt.Frame(values)
     # Don't leave inf/-inf
     for _ in range(result.ncols):
         result.replace([math.inf, -math.inf], None)
     return result
Esempio n. 20
0
def normal_tests(data, alpha=0.05):
    """Compare skewness, kurtosis and normality across transformations.

    The raw data and its square-root, cube-root, log and Yeo-Johnson
    transforms are each scored. The result is a DataFrame with one row per
    variant and one column per measure or test.
    """
    yj = stats.yeojohnson(data)[0]

    # row label -> values scored in that row (insertion order = row order)
    variants = {
        'default': data,
        'sqrt': np.sqrt(data),
        'cuberoot': np.cbrt(data),
        'log': np.log(data),
        'yeojohnson': yj,
    }

    # column label -> scoring function (insertion order = column order)
    measures = {
        'Skewness': stats.skew,
        'Skewness Test': lambda values: skewness_test(values, alpha=alpha),
        'Kurtosis': stats.kurtosis,
        'Kurtosis Test': lambda values: kurtosis_test(values, alpha=alpha),
        'Normal Test': lambda values: dagostino_test(values, alpha=alpha),
    }

    table = {
        title: [score(values) for values in variants.values()]
        for title, score in measures.items()
    }
    return pd.DataFrame(table, index=list(variants))
Esempio n. 21
0
def transform_variable(var, transform_choice):
    """Transform ``var`` with the selected method.

    Parameters
    ----------
    var : array-like
        Numeric data (e.g. a numpy array or pandas Series).
    transform_choice : int
        1 - logarithm: data with negative values is first shifted so its
            minimum becomes 1; data containing zeros uses ``log1p``.
        2 - square root: data with negative values is first shifted so its
            minimum becomes 0.
        3 - Yeo-Johnson with the optimal lambda.

    Returns
    -------
    The transformed data.

    Raises
    ------
    ValueError
        If ``transform_choice`` is not 1, 2 or 3.
    """
    # Inspect the values through a plain array: on a pandas Series the
    # ``in`` operator tests the index labels rather than the data.
    values = np.asarray(var)

    if transform_choice == 1:
        if np.any(values < 0):
            return np.log(var + (np.abs(np.min(var)) + 1))
        if np.any(values == 0):
            return np.log1p(var)
        return np.log(var)
    if transform_choice == 2:
        if np.any(values < 0):
            return np.sqrt(var + np.abs(np.min(var)))
        return np.sqrt(var)
    if transform_choice == 3:
        x, _ = stats.yeojohnson(var)
        return x
    raise ValueError('Transform option not supported')
Esempio n. 22
0
 def normalize(self, df, method="boxcox"):
     """
     Normalizes every column of the given dataframe in place.

     Inputs:
     - df: dataframe with numeric columns
     - method: one of "boxcox", "log" or "yeojohnson"; any other value
       leaves the dataframe unchanged

     Returns the same dataframe object with normalized columns
     """
     if method == "boxcox":
         def apply(values):
             return stats.boxcox(values)[0]
     elif method == "log":
         apply = np.log
     elif method == "yeojohnson":
         def apply(values):
             return stats.yeojohnson(values)[0]
     else:
         return df

     for name in df.columns:
         df[name] = apply(df[name])
     return df
Esempio n. 23
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the selected variables with their fitted Yeo-Johnson lambda.

        Args:
            X: Pandas DataFrame of shape = [n_samples, n_features]
            The data to be transformed.

        Returns:
            The dataframe with the transformed variables.
        """

        # the parent class checks the input and that fit() was called
        X = super().transform(X)

        lambdas = self.lambda_dict_
        for name in self.variables:
            X[name] = stats.yeojohnson(X[name], lmbda=lambdas[name])

        return X
Esempio n. 24
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Estimate one Yeo-Johnson lambda for each variable to transform.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training data. It may hold more columns than the ones that
            get transformed.

        y: pandas Series, default=None
            Unused by this transformer; either y or None may be passed.

        Raises
        ------
         TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            - If there are no numerical variables in the df or the df is empty
            - If the variable(s) contain null values

        Returns
        -------
        self
        """

        # input checking is delegated to the parent class
        X = super().fit(X)

        fitted_lambdas = {}
        self.lambda_dict_ = fitted_lambdas

        # cast to float to avoid NumPy error
        selected = self.variables_
        X[selected] = X[selected].astype("float")

        for name in selected:
            # yeojohnson returns (transformed data, lambda); keep the lambda
            fitted_lambdas[name] = stats.yeojohnson(X[name])[1]

        self.n_features_in_ = X.shape[1]

        return self
Esempio n. 25
0
    def transform(self, X):
        """Add Yeo-Johnson and/or Box-Cox transformed columns to X.

        New columns are named ``<col>_YeoJohnson`` and ``<col>_BoxCox`` and
        are written into X itself, which is also returned. A Box-Cox column
        is set to 0. when the transformation raises ValueError.
        """

        def assign_boxcox(name, build_input, lmbda):
            # Input construction and assignment both stay inside the try so
            # a ValueError from either falls back to 0.
            target = name + "_BoxCox"
            try:
                X[target] = stats.boxcox(build_input(), lmbda=lmbda)
            except ValueError:
                X[target] = 0.

        # apply learned Yeo-Johnson transformation
        if self.yeojohnson:
            for name, lmbda in self.yj_lambdas_dict_.items():
                X[name + "_YeoJohnson"] = stats.yeojohnson(
                    X[name].values, lmbda=lmbda)

        if not self.boxcox:
            return X

        # apply learned Box-Cox transformation after shifting by 1
        for name, lmbda in self.bc_zero_lambdas_dict_.items():
            assign_boxcox(name, lambda: X[name].values + 1, lmbda)

        # apply learned Box-Cox transformation after shifting by |min| + 1
        for name, lmbda in self.bc_neg_lambdas_dict_.items():
            assign_boxcox(
                name,
                lambda: X[name].values + np.abs(np.min(X[name].values)) + 1,
                lmbda)

        # apply learned Box-Cox transformation without any shift
        for name, lmbda in self.bc_lambdas_dict_.items():
            assign_boxcox(name, lambda: X[name].values, lmbda)

        return X
Esempio n. 26
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Estimate one Yeo-Johnson lambda for each variable to transform.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training data. It may hold more columns than the ones that
            get transformed.

        y: pandas Series, default=None
            Unused by this transformer; either y or None may be passed.
        """

        # input checking is delegated to the parent class
        X = super()._fit_from_varlist(X)

        fitted_lambdas = {}
        self.lambda_dict_ = fitted_lambdas

        for name in self.variables_:
            # yeojohnson returns (transformed data, lambda); keep the lambda
            fitted_lambdas[name] = stats.yeojohnson(X[name])[1]

        return self
Esempio n. 27
0
def transform_norm(data, type_transform="yeojohnson"):
    """Power-transform ``data`` and fit a normal distribution to the result.

    Parameters
    ----------
    data : sequence of numbers
        Values to transform.
    type_transform : str
        ``"box-cox"`` or ``"yeojohnson"`` (default).

    Returns
    -------
    tuple
        ``(new_data, mu, std)``: the transformed data and the mean and
        standard deviation of the normal distribution fitted to it.

    Raises
    ------
    ValueError
        If ``type_transform`` is not one of the supported names.
    """
    if type_transform == "box-cox":
        minimum = min(data)
        print("Minimum value:", minimum)

        # Box-Cox needs strictly positive input, so negative data is shifted.
        shift = round(abs(minimum)) if minimum < 0 else 0
        if minimum + shift <= 0:
            # The rounded shift (or no shift when the minimum is exactly 0)
            # can still leave values at or below zero; move the minimum to 1.
            shift = 1 - minimum

        posdata = [x + shift for x in data]
        new_data, _ = boxcox(posdata)

    elif type_transform == "yeojohnson":
        # Yeo-Johnson power transformation
        new_data, _ = yeojohnson(data)
    else:
        raise ValueError("No such transformation")

    mu, std = norm.fit(new_data)

    return new_data, mu, std
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
# NOTE(review): ``np`` and ``stats`` are used below but are not imported in
# the visible lines above this point - confirm they are in scope.
# Seed the global NumPy RNG before drawing the sample below.
np.random.seed(1245)

# Generate some random variates and calculate Yeo-Johnson log-likelihood
# values for them for a range of ``lmbda`` values:

x = stats.loggamma.rvs(5, loc=10, size=1000)
lmbdas = np.linspace(-2, 10)
llf = np.zeros(lmbdas.shape, dtype=float)
for ii, lmbda in enumerate(lmbdas):
    llf[ii] = stats.yeojohnson_llf(lmbda, x)

# Also find the optimal lmbda value with `yeojohnson`:

x_most_normal, lmbda_optimal = stats.yeojohnson(x)

# Plot the log-likelihood as function of lmbda.  Draw a horizontal line at
# the log-likelihood of the optimal lmbda to check that it is really the
# maximum of the curve:

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(lmbdas, llf, 'b.-')
ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
ax.set_xlabel('lmbda parameter')
ax.set_ylabel('Yeo-Johnson log-likelihood')

# Now add some probability plots to show that where the log-likelihood is
# maximized the data transformed with `yeojohnson` looks closest to normal:
# NOTE(review): the code that consumes ``locs`` is not part of the visible
# excerpt.

locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
from scipy import stats
import matplotlib.pyplot as plt

# Generate some non-normally distributed data, and create a Yeo-Johnson plot.
# No seed is set here, so the sample differs from run to run:

x = stats.loggamma.rvs(5, size=500) + 5
fig = plt.figure()
ax = fig.add_subplot(111)
# Draw the plot on ``ax`` for lmbda values between -20 and 20.
prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)

# Determine the optimal ``lmbda`` to transform ``x`` and mark it with a
# vertical red line in the same plot:

_, maxlog = stats.yeojohnson(x)
ax.axvline(maxlog, color='r')

plt.show()
Esempio n. 30
0
    def eda_numerical_variable(self, variable):
        '''
        Print and plot an exploratory analysis of one numerical column.

        Parameter:
            variable: name of the column in ``self.__df__`` to analyse

        Provides basic statistics, missing values, distribution, spread
        statistics, Q-Q plot, box plot, outliers using IQR and various
        variable transformations. Nothing is returned; all output goes
        through ``print``/``printmd`` and the plotting libraries.
        '''
        c = variable
        s = self.__df__[variable]

        # Quartiles are taken with Series.quantile, which ignores missing
        # values, and are reused by the summary and the outlier check.
        q1 = s.quantile(0.25)
        q3 = s.quantile(0.75)

        # 1. Basic Statistics

        print('Total Number of observations : ', len(s))
        print()

        print('Datatype :', (s.dtype))
        print()

        printmd('**<u>5 Point Summary :</u>**')

        print('  Minimum  :\t\t', s.min(), '\n  25th Percentile :\t', q1,
              '\n  Median :\t\t', s.median(), '\n  75th Percentile :\t', q3,
              '\n  Maximum  :\t\t', s.max())

        print()

        # 2. Missing values

        printmd('**<u>Missing Values :</u>**')

        print('  Number :', s.isnull().sum())
        print('  Percentage :', s.isnull().mean()*100, '%')

        # 3. Histogram

        printmd('**<u>Variable distribution and Spread statistics :</u>**')

        sns.distplot(s.dropna(), hist=True, fit=norm, kde=True)
        plt.show()

        # 4. Spread Statistics

        print('Skewness :', s.skew())
        print('Kurtosis :', s.kurt())
        print()

        # 5. Q-Q plot
        printmd('**<u>Normality Check :</u>**')
        stats.probplot(s.dropna(), dist='norm', plot=plt)
        plt.show()

        # 6. Box plot to check the spread outliers
        print()
        printmd('**<u>Box Plot and Visual check for Outlier  :</u>**')
        sns.boxplot(s.dropna(), orient='v')
        plt.show()

        # 7. Get outliers. Here distance could be a user defined parameter
        #    which defaults to 1.5

        print()
        printmd('**<u>Outliers (using IQR):</u>**')

        # np.quantile returns NaN as soon as the column holds a missing
        # value, which made both outlier counts silently 0; the NaN-skipping
        # quartiles computed above avoid that.
        IQR = q3 - q1
        upper_boundary = q3 + 1.5 * IQR
        lower_boundary = q1 - 1.5 * IQR

        print('  Right end outliers :', np.sum(s > upper_boundary))
        print('  Left end outliers :', np.sum(s < lower_boundary))

        # 8. Various Variable Transformations

        print()
        printmd(f'**<u>Explore various transformations for {c}</u>**')
        print()

        print('1. Logarithmic Transformation')
        s_log = np.log(s)
        normality_diagnostic(s_log)

        print('2. Exponential Transformation')
        s_exp = np.exp(s)
        normality_diagnostic(s_exp)

        print('3. Square Transformation')
        s_sqr = np.square(s)
        normality_diagnostic(s_sqr)

        print('4. Square-root Transformation')
        s_sqrt = np.sqrt(s)
        normality_diagnostic(s_sqrt)

        print('5. Box-Cox Transformation')
        s_boxcox, lambda_param = stats.boxcox(s)
        normality_diagnostic(s_boxcox)
        print('Optimal Lambda for Box-Cox transformation is :', lambda_param)
        print()

        print('6. Yeo Johnson Transformation')
        # cast to float before handing the data to scipy
        s = s.astype('float')
        s_yeojohnson, lambda_param = stats.yeojohnson(s)
        normality_diagnostic(s_yeojohnson)
        print('Optimal Lambda for Yeo Johnson transformation is :', lambda_param)
        print()