# 3. Data pre-processing
#
# Notebook-style cells that prepare the raw frame `dfFullData` for
# classification: drop unused columns, look for duplicate rows, handle skew,
# then encode the non-numeric columns. `dataManager`, `analyser`, `dfFullData`,
# `output` and `size` are defined earlier in the file.

# 3.1 Remove unnecessary features
# Drop the columns that are not useful for classification.
dropColslist = ['id', 'Unnamed: 32']
df_drop_idUnamed = dataManager.dropUnnecessaryColumns(dfFullData, dropColslist)
df_drop_idUnamed.info()

# %%
# 3.2 Check duplicated records
# Select every row that repeats an earlier row. Per the original author's note
# this selection is expected to be empty for this dataset.
duplicateRowsDF = df_drop_idUnamed[df_drop_idUnamed.duplicated()]
duplicateRowsDF

# %%
# 3.3 Check for skewed data and try to normalize records
# Check the numerical columns for skew and normalize the skewed ones.
# NOTE(review): presumably checkSkew returns the skew-corrected frame rather
# than a skew report -- confirm against dataManager.checkSkew.
dfskew = dataManager.checkSkew(df_drop_idUnamed)
# `output` is concatenated as-is, so it must already end with a path separator.
dfskew.to_excel(output + "DF_Skew.xlsx")
dfskew

# Display histograms to check the general distribution of the numerical data
# after un-skewing. Per the original note, the file output is at
# \output\04_SKEW_DistPlot.png
analyser.histogramOrBoxPlotAnalysis(dfskew, strCols=False, hist=True, boxSize=size, fileName='04_SKEW')

# %%
# 3.4 Apply encoding on dataset.
# Per the original notes (the logic lives in the helper, not shown here):
#   - one-hot encoding (OHE) is applied to categorical columns with more than
#     2 distinct values;
#   - label encoding (LE) is applied to categorical columns with exactly
#     2 distinct values.
df_le = dataManager.applyEncodingToNonNumericData(dfskew)
# Display the one-hot encoding table