Beispiel #1
0
                                    hist=True,
                                    boxSize=size,
                                    fileName='01_INITIAL')
# %%
# Outlier check: draw box plots of the numerical data, which has been scaled.
# File output is at \output\02_OUTLIER_BoxPlot.png
analyser.histogramOrBoxPlotAnalysis(
    dfFullData,
    strCols=True,
    hist=False,  # presumably selects box plots over histograms -- confirm in analyser
    boxSize=size,
    fileName="02_OUTLIER",
)
# %%
# 3. Data pre-processing
# 3.1 Remove unnecessary features
# These columns are not useful for classification, so they are dropped.
dropColslist = [
    "id",
    "Unnamed: 32",
]
df_drop_idUnamed = dataManager.dropUnnecessaryColumns(
    dfFullData,
    dropColslist,
)
# Summarise the resulting frame to confirm which columns remain.
df_drop_idUnamed.info()
# %%
# 3.2 Check duplicated records
# Select every row flagged as a repeat of an earlier row.
# The original author notes that this dataset contains no duplicates.
duplicateRowsDF = df_drop_idUnamed.loc[df_drop_idUnamed.duplicated(keep="first")]
# Bare expression kept as the cell's last statement so an interactive runner can display it.
duplicateRowsDF

#%%
# 3.3 Check for skewed data and try to normalize records
# Check the numerical data for skew and process skewed data to normalize it.
# NOTE(review): checkSkew's implementation is not visible here -- confirm
# whether it returns a skew report or the transformed data.
dfskew = dataManager.checkSkew(df_drop_idUnamed)
# Save the result as an Excel file. `output` is joined to the file name by plain
# concatenation, so it must already end with a path separator; otherwise the
# file name is glued onto the last path component.
dfskew.to_excel(output + "DF_Skew.xlsx")
# NOTE(review): bare expression that is not the last statement of its cell;
# it likely displays nothing when the cell is run -- confirm.
dfskew
# Display histogram to check the general distribution of the numerical data after unskewing. File output is at \output\04_SKEW_DistPlot.png
analyser.histogramOrBoxPlotAnalysis(dfskew,