Beispiel #1
0
# Scale the encoded frame `df_le` and keep the result for the following steps.
# NOTE(review): the original note called this "one hot encoded" data, but the
# `_le` suffix suggests label encoding -- confirm which encoding is in use.
df_Scale_le = dataManager.scaleData(df_le)

# Persist the scaled frame as a spreadsheet under the `output` prefix.
df_Scale_le.to_excel(
    output + "DF_Scale_LE.xlsx"
)

# Bare expression kept for interactive sessions that echo it (presumably to
# show the scaled frame -- it has no effect when run as a plain script).
df_Scale_le

# Histogram over all data after scaling, to check the general distribution.
# Per the original note the image is written to \output\06_SCALE_DistPlot.png.
analyser.histogramOrBoxPlotAnalysis(
    df_Scale_le,
    fileName='06_SCALE',
    boxSize=size,
    hist=True,
    strCols=True,
)

#%%
# 3.6 Remove outlier data.
# The box plot indicated outliers in the data frame, so they are removed from
# the scaled frame here.
# NOTE(review): the result is named `..._ohe` although it is derived from
# `df_Scale_le` and exported as `DF_NoOutlier_LE.xlsx` -- confirm the naming.
df_noOutlier_ohe = dataManager.removeOutlier(
    df_Scale_le
)

# Export the outlier-free frame as a spreadsheet under the `output` prefix.
df_noOutlier_ohe.to_excel(
    output + "DF_NoOutlier_LE.xlsx"
)
#%%
# Histogram over all data after outlier removal, to check the general
# distribution. Per the original note the file is written under \output\.
analyser.histogramOrBoxPlotAnalysis(
    df_noOutlier_ohe,
    fileName='07_NO_OUTLIER',
    boxSize=size,
    hist=True,
    strCols=True,
)
#%%
# 3.7 Analyse correlation between features and remove highly correlated
# features.
# Correlation between features with a threshold of 90% (passed as 0.90).
# NOTE(review): whether `showCorr` itself drops the correlated features or
# only reports them is not visible here -- confirm against `dataManager`.
df_corr_ohe = dataManager.showCorr(
    df_noOutlier_ohe,
    0.90,
)

# Bare expression kept so interactive sessions echo the result of the cell.
df_corr_ohe
#%%