Example #1
0
# 3. Data pre-processing
# 3.1 Remove unnecessary features
# Drop the columns that are not useful for classification.
# NOTE(review): 'Unnamed: 32' looks like an empty trailing column produced
# when the source file was read -- confirm against the loader.
dropColslist = ['id', 'Unnamed: 32']
df_drop_idUnamed = dataManager.dropUnnecessaryColumns(dfFullData, dropColslist)
# Print a summary of the resulting frame so the remaining columns can be
# inspected (presumably a pandas DataFrame -- confirm).
df_drop_idUnamed.info()
# %%
# 3.2 Check for duplicated records
# Keep only the rows that duplicated() flags as repeats of another row.
# An empty result means there are no duplicates; the original author noted
# that none were found in this dataset.
duplicateRowsDF = df_drop_idUnamed[df_drop_idUnamed.duplicated()]
# Bare expression: as the last statement of the cell it shows the frame when
# the cell is run interactively.
duplicateRowsDF

#%%
# 3.3 Check for skewed data and try to normalize records
# Check the numerical columns for skew and process skewed data to normalize it.
# NOTE(review): checkSkew is defined elsewhere. Its result is passed on as the
# working dataset below, so it presumably returns the transformed data rather
# than a skew report -- confirm.
dfskew = dataManager.checkSkew(df_drop_idUnamed)
# Save the result as an Excel workbook. The path is built by plain string
# concatenation, so `output` needs a trailing path separator for the file to
# land inside that directory.
dfskew.to_excel(output + "DF_Skew.xlsx")
# Bare expression kept from notebook use; it is not the last statement of
# this cell.
dfskew
# Display histograms to check the general distribution of the numerical data
# after un-skewing. File output is at \output\04_SKEW_DistPlot.png
analyser.histogramOrBoxPlotAnalysis(dfskew,
                                    strCols=False,
                                    hist=True,
                                    boxSize=size,
                                    fileName='04_SKEW')

#%%
# 3.4 Apply encoding to the dataset.
# Per the original notes:
#   - one-hot encoding (OHE) is applied to categorical columns with more than
#     2 values;
#   - label encoding (LE) is applied to categorical columns with exactly
#     2 values.
# NOTE(review): the helper's implementation is not visible here -- confirm it
# matches this description.
df_le = dataManager.applyEncodingToNonNumericData(dfskew)
# Display the one-hot encoding table