Beispiel #1
0
# Outlier removal.
# The box plots show outliers in the data frame, so strip them out here and
# keep a copy of the cleaned frame as an Excel file under `output`.
df_noOutlier_ohe = dataManager.removeOutlier(df_Scale_le)
no_outlier_xlsx_path = output + "DF_NoOutlier_LE.xlsx"
df_noOutlier_ohe.to_excel(no_outlier_xlsx_path)
#%%
# Plot histograms to check the general distribution across all the data
# now that the outliers have been removed. The output file is written
# to \output\.
analyser.histogramOrBoxPlotAnalysis(
    df_noOutlier_ohe,
    strCols=True,
    hist=True,
    boxSize=size,
    fileName='07_NO_OUTLIER',
)
#%%
# 3.7 Analyse the correlation between features so that highly correlated
# features can be removed. The correlation report uses a threshold of 90%.
corr_threshold = 0.90
df_corr_ohe = dataManager.showCorr(df_noOutlier_ohe, corr_threshold)
# Bare expression kept on purpose -- presumably so an interactive cell
# displays the correlation table.
df_corr_ohe
#%%
# Based on the correlation table, some features are highly correlated with
# each other, so a subset of them has to be dropped before running a model.
dropColslist = [
    'radius_mean',
    'perimeter_mean',
    'radius_worst',
    'area_mean',
    'radius_se',
    'area_worst',
    'perimeter_se',
    'concavity_mean',
    'texture_mean',
    'concave points_mean',
]
df_Final_ohe = dataManager.dropUnnecessaryColumns(
    df_noOutlier_ohe,
    dropColslist,
)
# Save the reduced frame as an Excel file under `output`.
final_xlsx_path = output + "DF_Final_LE.xlsx"
df_Final_ohe.to_excel(final_xlsx_path)
# Bare expression kept on purpose -- presumably so an interactive cell
# displays the final frame.
df_Final_ohe
#%%