def featureSelectionUsingExtraTreesClassifier(dataSetForFeatureSelection):
    """Keep the features an ExtraTreesClassifier ranks above the mean importance.

    The dataset is first passed through ``featureEncodingUsingLabelEncoder``,
    then a 700-tree ``ExtraTreesClassifier`` is fitted on every column except
    the label (the label itself is label-encoded for fitting only). Features
    whose importance is strictly greater than the mean importance are kept.

    Args:
        dataSetForFeatureSelection: DataFrame holding the feature columns and
            the label column named by ``getLabelName()``.

    Returns:
        A new DataFrame with the retained feature columns in their original
        order, followed by the label column.
    """
    print(
        "\n****** Start performing feature selection using ExtraTreesClassifier *****"
    )
    print("****** Falls under wrapper methods (feature importance) *****")
    labelName = getLabelName()

    # The classifier needs numeric inputs, so encode categorical features first.
    dataSetForFeatureSelection = featureEncodingUsingLabelEncoder(
        dataSetForFeatureSelection)
    features = dataSetForFeatureSelection.drop([labelName], axis=1)
    label = dataSetForFeatureSelection[labelName]
    labelTransformed = LabelEncoder().fit_transform(label)

    print("****** ExtraTreesClassification is in progress *****")
    trainedforest = ExtraTreesClassifier(n_estimators=700).fit(
        features, labelTransformed)
    importances = trainedforest.feature_importances_

    # Only keep features whose importance is greater than the mean importance.
    aboveMeanImportance = importances > np.mean(importances)
    featureImportances = pd.Series(importances, index=features.columns)
    # Sorted view of the kept features, used for reporting only.
    selectedFeatures = featureImportances.nlargest(
        int(aboveMeanImportance.sum()))
    print("\n selectedFeatures after ExtraTreesClassification: ",
          selectedFeatures)
    print("****** Completed ExtraTreesClassification *****")

    # Select the kept columns directly (original column order is preserved)
    # and re-attach the label as the last column.
    dataSetAfterFeatuerSelection = features.loc[:, aboveMeanImportance].copy()
    dataSetAfterFeatuerSelection[labelName] = label

    print('\n***** Number of columns in the dataSet after feature selection: ',
          len(dataSetAfterFeatuerSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n',
          dataSetAfterFeatuerSelection.columns)
    print(
        "****** End performing feature selection using ExtraTreesClassifier *****"
    )
    return dataSetAfterFeatuerSelection
def splitCompleteDataSetIntoTrainingSetAndTestingSet(completeDataSet,
                                                     testSize=0.4,
                                                     randomState=42):
    """Split a dataset into training/testing features and labels.

    Args:
        completeDataSet: DataFrame containing the feature columns and the
            label column named by ``getLabelName()``.
        testSize: Value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.4, the value that was previously hard-coded.
        randomState: Value forwarded to ``train_test_split`` as
            ``random_state``. Defaults to 42, the previously hard-coded seed.

    Returns:
        A 4-tuple ``(trainingFeatures, testingFeatures, trainingLabel,
        testingLabel)`` in the order produced by ``train_test_split``.
    """
    labelName = getLabelName()
    label = completeDataSet[labelName]
    features = completeDataSet.drop(labelName, axis=1)

    splitParts = train_test_split(features,
                                  label,
                                  test_size=testSize,
                                  random_state=randomState)
    print("features.shape: ", features.shape)
    print("label.shape: ", label.shape)

    (featuresInPreProcessedTrainingDataSet,
     featuresInPreProcessedTestingDataSet,
     labelInPreProcessedTrainingDataSet,
     labelInPreProcessedTestingDataSet) = splitParts
    return (featuresInPreProcessedTrainingDataSet,
            featuresInPreProcessedTestingDataSet,
            labelInPreProcessedTrainingDataSet,
            labelInPreProcessedTestingDataSet)
def featureEncodingUsingBinaryEncoder(dataSetForFeatureEncoding):
    """Binary-encode every object-typed feature column of the dataset.

    Each categorical feature is replaced by the columns produced by
    ``ce.BinaryEncoder`` for that feature; the new columns are appended after
    the existing ones and the label column is moved to the end.

    Args:
        dataSetForFeatureEncoding: DataFrame holding the feature columns and
            the label column named by ``getLabelName()``. It is not modified.

    Returns:
        A new DataFrame with the categorical features binary-encoded and the
        label as the last column.
    """
    print(
        "****** Start binary encoding on the categorical features in the given dataset *****"
    )
    labelName = getLabelName()

    # Names of the categorical (object dtype) features, label excluded.
    categoricalColumnNames = dataSetForFeatureEncoding.drop(
        [labelName], axis=1).select_dtypes(['object']).columns.values

    print("****** Number of features before binary encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",
          len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",
          categoricalColumnNames)
    print(
        '\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n'
    )

    label = dataSetForFeatureEncoding[labelName]
    for feature in categoricalColumnNames:
        distinctValues = dataSetForFeatureEncoding[feature].unique()
        print('\n{}: {} '.format(feature, len(distinctValues)))
        printList(distinctValues, 'distinct values')

        binaryEncoder = ce.BinaryEncoder(cols=[feature])
        binaryEncodedFeature = binaryEncoder.fit_transform(
            dataSetForFeatureEncoding[[feature]], label)

        # Attach the encoded columns by position rather than with an index
        # join: joining on an index that contains duplicate labels (e.g. after
        # concatenating two frames) would multiply the rows.
        dataSetForFeatureEncoding = dataSetForFeatureEncoding.drop(feature,
                                                                   axis=1)
        for encodedColumnName in binaryEncodedFeature.columns:
            dataSetForFeatureEncoding[encodedColumnName] = binaryEncodedFeature[
                encodedColumnName].values

    # Move the label column to the end.
    dataSetForFeatureEncoding = dataSetForFeatureEncoding.drop(labelName,
                                                               axis=1)
    dataSetForFeatureEncoding[labelName] = label.values

    print("****** Number of features after binary encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print(
        "****** End binary encoding on the categorical features in the given dataset *****\n"
    )
    return dataSetForFeatureEncoding
def featureEncodingUsingFrequencyEncoder(dataSetForFeatureEncoding):
    """Frequency-encode every object-typed feature column of the dataset.

    Each categorical feature ``X`` is replaced by a column ``X_Encoded`` that
    holds, for every row, the relative frequency of that row's category
    (group size divided by the number of rows). The label column is moved to
    the end.

    Args:
        dataSetForFeatureEncoding: DataFrame holding the feature columns and
            the label column named by ``getLabelName()``. It is not modified.

    Returns:
        A new DataFrame with the categorical features frequency-encoded and
        the label as the last column.
    """
    print(
        "****** Start frequency encoding on the categorical features in the given dataset *****"
    )
    labelName = getLabelName()

    # Names of the categorical (object dtype) features, label excluded.
    categoricalColumnNames = dataSetForFeatureEncoding.drop(
        [labelName], axis=1).select_dtypes(['object']).columns.values

    # The message previously said "label encoding" (copy-paste slip).
    print("****** Number of features before frequency encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",
          len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",
          categoricalColumnNames)
    print(
        '\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n'
    )

    label = dataSetForFeatureEncoding[labelName]
    for feature in categoricalColumnNames:
        distinctValues = dataSetForFeatureEncoding[feature].unique()
        print('\n{}: {} '.format(feature, len(distinctValues)))
        printList(distinctValues, 'distinct values')

        frequencyEncoder = dataSetForFeatureEncoding.groupby(
            feature).size() / len(dataSetForFeatureEncoding)
        encodedValues = dataSetForFeatureEncoding[feature].map(
            frequencyEncoder)

        # Drop first so the new column is written to a fresh frame instead of
        # the frame handed in by the caller.
        dataSetForFeatureEncoding = dataSetForFeatureEncoding.drop(feature,
                                                                   axis=1)
        dataSetForFeatureEncoding[feature + "_Encoded"] = encodedValues.values

    # Move the label column to the end.
    dataSetForFeatureEncoding = dataSetForFeatureEncoding.drop(labelName,
                                                               axis=1)
    dataSetForFeatureEncoding[labelName] = label.values

    print("****** Number of features after frequency encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print(
        "****** End frequency encoding on the categorical features in the given dataset *****\n"
    )
    return dataSetForFeatureEncoding
def featureEncodingUsingOneHotEncoder(dataSetForFeatureEncoding):
    """One-hot encode every object-typed feature column of the dataset.

    The categorical features are expanded with ``pd.get_dummies`` (each dummy
    column is prefixed with its source feature name) and the label column is
    moved to the end.

    Args:
        dataSetForFeatureEncoding: DataFrame holding the feature columns and
            the label column named by ``getLabelName()``. It is not modified.

    Returns:
        A new DataFrame with the categorical features one-hot encoded and the
        label as the last column.
    """
    print(
        "****** Start one hot encoding on the categorical features in the given dataset *****"
    )
    labelName = getLabelName()

    # Names of the categorical (object dtype) features, label excluded.
    categoricalColumnNames = list(
        dataSetForFeatureEncoding.drop(
            [labelName], axis=1).select_dtypes(['object']).columns.values)

    print("****** Number of features before one hot encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",
          len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",
          categoricalColumnNames)
    print(
        '\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n'
    )

    # Report exactly the columns that are about to be encoded, in column
    # order, like the other encoders in this file do.
    for feature in categoricalColumnNames:
        distinctValues = dataSetForFeatureEncoding[feature].unique()
        print('\n{}: {} '.format(feature, len(distinctValues)))
        printList(distinctValues, 'distinct values')

    # Using get_dummies to create the dummy variables for the categorical columns.
    onHotEncodedDataSet = pd.get_dummies(dataSetForFeatureEncoding,
                                         columns=categoricalColumnNames,
                                         prefix=categoricalColumnNames)

    # Move the label column to the end by re-selecting the columns; this builds
    # a new frame, so the input is untouched even when nothing was encoded.
    columnsWithLabelLast = [
        column for column in onHotEncodedDataSet.columns
        if column != labelName
    ] + [labelName]
    onHotEncodedDataSet = onHotEncodedDataSet[columnsWithLabelLast]

    print("****** Number of features after one hot encoding: ",
          len(onHotEncodedDataSet.columns))
    print(
        "****** End one hot encoding on the categorical features in the given dataset *****\n"
    )
    return onHotEncodedDataSet
def featureScalingUsingStandardScalar(dataSetForFeatureScaling):
    """Standardize every feature column with ``StandardScaler``.

    Args:
        dataSetForFeatureScaling: DataFrame holding the feature columns and
            the label column named by ``getLabelName()``. It is not modified
            (the label used to be popped out of the caller's frame).

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the scaled
        feature columns followed by the label column.
    """
    print(
        "****** Start feature scaling of the features present in the dataset using StandardScalar *****"
    )
    labelName = getLabelName()
    print(dataSetForFeatureScaling.values)

    # Separate the label by name, so the result is correct wherever the label
    # column sits and the caller's frame keeps its label.
    label = dataSetForFeatureScaling[labelName]
    featureFrame = dataSetForFeatureScaling.drop(labelName, axis=1)
    features = featureFrame.values
    print(
        "\n****** Number of features in the dataset before performing scaling: ",
        np.size(features, 1))
    print(
        "\n****** Features in the dataset before performing scaling ***** \n",
        features)

    # Perform feature scaling.
    scaledFeatures = StandardScaler().fit_transform(features)
    print(
        "\n****** Number of features in the dataset after performing scaling: ",
        np.size(scaledFeatures, 1))
    print("\n****** Features in the dataset after performing scaling ***** \n",
          scaledFeatures)

    # Convert from array format back to a DataFrame; the label index is reset
    # so it lines up with the new frame's default index.
    scaledFeatures = pd.DataFrame(scaledFeatures, columns=featureFrame.columns)
    scaledFeatures[labelName] = label.reset_index(drop=True)

    print("scaledFeatures.head(): ", scaledFeatures.head())
    print("scaledFeatures.shape: ", scaledFeatures.shape)
    print(
        "\n****** End of feature scaling of the features present in the dataset using StandardScalar *****\n"
    )
    return scaledFeatures
def featureSelectionUsingChisquaredTest(dataSetForFeatureSelection,
                                        numberOfFeatureToBeSelected=10):
    """Keep the features with the highest chi-squared scores.

    The dataset is passed through ``featureEncodingUsingLabelEncoder`` and a
    ``SelectKBest(chi2)`` selector is fitted on every column except the
    label; columns the selector rejects are removed.

    Args:
        dataSetForFeatureSelection: DataFrame holding the feature columns and
            the label column named by ``getLabelName()``.
        numberOfFeatureToBeSelected: Number of features to keep. Defaults to
            10 (the previously hard-coded value) and is capped at the number
            of available features.

    Returns:
        A new DataFrame containing the selected features and the label, in
        their original column order.
    """
    print(
        "\n****** Start performing feature selection using ChisquaredTest *****"
    )
    print("****** Falls under filter methods (univariate selection) *****")
    labelName = getLabelName()

    # Encode categorical features to be able to apply the chi-squared test.
    dataSetForFeatureSelection = featureEncodingUsingLabelEncoder(
        dataSetForFeatureSelection)
    features = dataSetForFeatureSelection.drop([labelName], axis=1)
    label = dataSetForFeatureSelection[labelName]

    # Never ask for more features than the dataset has.
    numberOfFeatureToBeSelected = min(numberOfFeatureToBeSelected,
                                      features.shape[1])
    bestfeatures = SelectKBest(score_func=chi2, k=numberOfFeatureToBeSelected)
    fitBestfeatures = bestfeatures.fit(features, label)

    # Pair each feature name with its score for better visualization.
    scoresOfBestFeatures = pd.DataFrame({
        'Features': features.columns,
        'Score': fitBestfeatures.scores_
    })
    print("\n***** Scores for each feature in the dataset are *****")
    print(scoresOfBestFeatures.nlargest(numberOfFeatureToBeSelected, 'Score'))

    # Drop the rejected columns into a new frame instead of popping them out
    # of the encoded frame one by one.
    rejectedColumns = features.columns[~fitBestfeatures.get_support()]
    dataSetAfterFeatuerSelection = dataSetForFeatureSelection.drop(
        rejectedColumns, axis=1)

    print('***** Number of columns in the dataSet after feature selection: ',
          len(dataSetAfterFeatuerSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n',
          dataSetAfterFeatuerSelection.columns)
    print("****** End performing feature selection using ChisquaredTest *****")
    return dataSetAfterFeatuerSelection
def featureSelectionUsingTheilU(dataSetForFeatureSelection, threshold=0.50):
    """Drop the features whose Theil's U against the label is below a threshold.

    ``theil_u(label, column)`` is computed for every column (the label
    included, so it appears in the plot), the coefficients are shown as a
    one-row heatmap, and every feature scoring below ``threshold`` is removed.

    Args:
        dataSetForFeatureSelection: DataFrame holding the feature columns and
            the label column named by ``getLabelName()``. It is not modified.
        threshold: Minimum uncertainty coefficient a feature needs to be
            kept. Defaults to 0.50, the previously hard-coded value.

    Returns:
        A new DataFrame containing the retained columns; the label column is
        always retained.
    """
    print("\n****** Start performing feature selection using TheilU *****")
    print(
        "****** Falls under the group of techniques that use correlation matrix with Heatmap *****"
    )
    labelName = getLabelName()
    labelValues = dataSetForFeatureSelection[labelName].tolist()

    uncertaintyByColumn = {}
    columnsToDrop = []
    for columnName in dataSetForFeatureSelection.columns:
        u = theil_u(labelValues,
                    dataSetForFeatureSelection[columnName].tolist())
        uncertaintyByColumn[columnName] = u
        # The label is never a drop candidate, whatever its coefficient is.
        if columnName != labelName and u < threshold:
            columnsToDrop.append(columnName)

    # Build a new frame instead of popping columns out of the caller's frame.
    dataSetAfterFeatuerSelection = dataSetForFeatureSelection.drop(
        columnsToDrop, axis=1)

    print(
        '***** Ploting the uncertainty coefficient between the target and each feature *****'
    )
    # Explicit float dtype: the heatmap needs numeric data, and an empty frame
    # filled cell by cell would stay object-typed.
    theilu = pd.DataFrame(uncertaintyByColumn, index=[labelName], dtype=float)
    plt.figure(figsize=(30, 1))
    sns.heatmap(theilu, annot=True, fmt='.2f')
    plt.show()

    print('***** Number of columns in the dataSet after feature selection: ',
          len(dataSetAfterFeatuerSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n',
          dataSetAfterFeatuerSelection.columns)
    print("****** End performing feature selection using TheilU *****")
    return dataSetAfterFeatuerSelection
def featureEncodingUsingLabelEncoder(dataSetForFeatureEncoding):
    """Label-encode every object-typed feature column of the dataset.

    Each categorical feature is replaced, under the same column name, by the
    integer codes produced by ``LabelEncoder``. The label column is left as
    it is and the column order does not change.

    Args:
        dataSetForFeatureEncoding: DataFrame holding the feature columns and
            the label column named by ``getLabelName()``. It is not modified;
            the encoding is applied to a copy.

    Returns:
        A new DataFrame with the categorical features label-encoded.
    """
    print(
        "****** Start label encoding on the categorical features in the given dataset *****"
    )
    labelName = getLabelName()

    # Work on a copy so the caller's frame keeps its original values, as with
    # the other encoders in this file.
    dataSetForFeatureEncoding = dataSetForFeatureEncoding.copy()

    # Names of the categorical (object dtype) features, label excluded.
    categoricalColumnNames = dataSetForFeatureEncoding.drop(
        [labelName], axis=1).select_dtypes(['object']).columns.values

    print("****** Number of features before label encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",
          len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",
          categoricalColumnNames)
    print(
        '\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n'
    )

    labelEncoder = LabelEncoder()
    for feature in categoricalColumnNames:
        distinctValues = dataSetForFeatureEncoding[feature].unique()
        print('\n{}: {} '.format(feature, len(distinctValues)))
        printList(distinctValues, 'distinct values')
        dataSetForFeatureEncoding[feature] = labelEncoder.fit_transform(
            dataSetForFeatureEncoding[feature])

    print("****** Number of features after label encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print(
        "****** End label encoding on the categorical features in the given dataset *****\n"
    )
    return dataSetForFeatureEncoding
def performPreprocessingBuildModelsAndEvaluateAccuracy(trainingDataSet, testingDataSet, arrayOfModels):
    """Preprocess the data, then train, score and save one classifier per model entry.

    The first four items of each processed entry of ``arrayOfModels`` name the
    feature selection, feature encoding, feature scaling and classification
    technique. For every processed entry the training accuracy, testing
    accuracy, model name and model file path are appended to the entry (in
    that order) and the fitted classifier is written to the model file with
    ``joblib.dump``.

    Args:
        trainingDataSet: Not used; the training data is reloaded from the path
            given by ``getPathToTrainingAndTestingDataSets()`` for every
            model. Kept so existing callers keep working.
        testingDataSet: Not used; reloaded like ``trainingDataSet``.
        arrayOfModels: List of model entries (lists); extended in place.

    Raises:
        ValueError: If an entry names a technique that is not supported.
            Previously this surfaced as an ``UnboundLocalError`` or silently
            reused the result of the preceding model.
    """
    # NOTE(review): the loop starts at index 1, so arrayOfModels[0] is never
    # built - presumably a header/placeholder row; confirm against the caller.
    for i in range(1, len(arrayOfModels)):
        modelEntry = arrayOfModels[i]
        featureSelection = modelEntry[0]
        featureEncoding = modelEntry[1]
        featureScaling = modelEntry[2]
        classification = modelEntry[3]

        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i ,' As Below *************************************************')
        print('\t -- Feature Selection: \t ', featureSelection, ' \n\t -- Feature Encoding: \t ', featureEncoding, ' \n\t -- Feature Scaling: \t ', featureScaling, ' \n\t -- Classification: \t ', classification, '\n')

        # Reload the raw data so that every model starts from the same input.
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        trainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        testingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        # Training and testing data are preprocessed together because in some
        # datasets the categorical columns hold different values in the two
        # files, which causes issues while applying classification techniques.
        # ignore_index gives the combined frame unique row labels; duplicated
        # labels would break index-based joins in later steps.
        completeDataSet = pd.concat((trainingDataSet, testingDataSet), ignore_index=True)
        print("completeDataSet.shape: ", completeDataSet.shape)
        print("completeDataSet.head: ", completeDataSet.head())

        # Feature Selection
        if featureSelection == 'TheilsU':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingTheilU(completeDataSet)
        elif featureSelection == 'Chi-SquaredTest':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingChisquaredTest(completeDataSet)
        elif featureSelection == 'RandomForestClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingRandomForestClassifier(completeDataSet)
        elif featureSelection == 'ExtraTreesClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingExtraTreesClassifier(completeDataSet)
        else:
            raise ValueError('Unsupported feature selection technique: {!r}'.format(featureSelection))

        # Feature Encoding
        if featureEncoding == 'LabelEncoder':
            completeEncodedDataSet = featureEncodingUsingLabelEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncoding == 'OneHotEncoder':
            completeEncodedDataSet = featureEncodingUsingOneHotEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncoding == 'FrequencyEncoder':
            completeEncodedDataSet = featureEncodingUsingFrequencyEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncoding == 'BinaryEncoder':
            completeEncodedDataSet = featureEncodingUsingBinaryEncoder(completeDataSetAfterFeatuerSelection)
        else:
            raise ValueError('Unsupported feature encoding technique: {!r}'.format(featureEncoding))

        # Feature Scaling
        if featureScaling == 'Min-Max':
            completeEncodedAndScaledDataset = featureScalingUsingMinMaxScaler(completeEncodedDataSet)
        elif featureScaling == 'Binarizing':
            completeEncodedAndScaledDataset = featureScalingUsingBinarizer(completeEncodedDataSet)
        elif featureScaling == 'Normalizing':
            completeEncodedAndScaledDataset = featureScalingUsingNormalizer(completeEncodedDataSet)
        elif featureScaling == 'Standardization':
            completeEncodedAndScaledDataset = featureScalingUsingStandardScalar(completeEncodedDataSet)
        else:
            raise ValueError('Unsupported feature scaling technique: {!r}'.format(featureScaling))

        # Split the complete dataSet into training dataSet and testing dataSet
        (featuresInPreProcessedTrainingDataSet,
         featuresInPreProcessedTestingDataSet,
         labelInPreProcessedTrainingDataSet,
         labelInPreProcessedTestingDataSet) = splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)
        trainingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTrainingDataSet, labelInPreProcessedTrainingDataSet], axis=1, sort=False)
        testingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTestingDataSet, labelInPreProcessedTestingDataSet], axis=1, sort=False)

        # Classification. The option strings are matched exactly as the
        # callers spell them, so they are left untouched.
        if classification == 'DecisonTree':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingDecisionTreeClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classification == 'RandomForestClassifier':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingRandomForestClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classification == 'ExtraTreesClassifier':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingExtraTreesClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classification == 'LogisticRegressionRegression':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingLogisticRegression(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classification == 'LinearDiscriminantAnalysis':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingLinearDiscriminantAnalysis(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classification == 'GuassianNaiveBayes':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingGaussianNB(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classification == 'KNN':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingKNNClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        else:
            raise ValueError('Unsupported classification technique: {!r}'.format(classification))

        modelName = featureSelection + "_" + featureEncoding + "_" + featureScaling + "_" + classification
        modelFileName = getPathToGenerateModels() + modelName + ".pkl"

        # Record the results on the model entry itself.
        modelEntry.append(trainingAccuracyScore)
        modelEntry.append(testingAccuracyScore)
        modelEntry.append(modelName)
        modelEntry.append(modelFileName)

        # Save the model to file
        joblib.dump(classifier, modelFileName)
def performPreprocessing(trainingDataSet, testingDataSet, arrayOfModels):
    """Run feature selection, encoding and scaling for each model entry.

    The first three items of every entry of ``arrayOfModels`` name the feature
    selection, feature encoding and feature scaling technique to apply to the
    combined training and testing data.

    Args:
        trainingDataSet: Not used; the training data is reloaded from the path
            given by ``getPathToTrainingAndTestingDataSets()`` for every
            entry. Kept so existing callers keep working.
        testingDataSet: Not used; reloaded like ``trainingDataSet``.
        arrayOfModels: List of model entries (lists of technique names).

    Returns:
        The selected, encoded and scaled dataset of the last entry, or
        ``None`` when ``arrayOfModels`` is empty.

    Raises:
        ValueError: If an entry names a technique that is not supported.
            Previously this surfaced as an ``UnboundLocalError`` or silently
            reused the result of the preceding entry.
    """
    # NOTE(review): the original layout does not show whether the return sat
    # inside or after the loop; it is placed after the loop here. Both readings
    # agree for a single-entry list - confirm for multi-entry callers.
    completeEncodedAndScaledDataset = None

    for i in range(0, len(arrayOfModels)):
        modelEntry = arrayOfModels[i]
        featureSelection = modelEntry[0]
        featureEncoding = modelEntry[1]
        featureScaling = modelEntry[2]

        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i ,' As Below *************************************************')
        print('\t -- Feature Selection: \t ', featureSelection, ' \n\t -- Feature Encoding: \t ', featureEncoding, ' \n\t -- Feature Scaling: \t ', featureScaling, '\n')

        # Reload the raw data so that every entry starts from the same input.
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        trainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        testingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        # Training and testing data are preprocessed together because in some
        # datasets the categorical columns hold different values in the two
        # files, which causes issues while applying classification techniques.
        # ignore_index gives the combined frame unique row labels; duplicated
        # labels would break index-based joins in later steps.
        completeDataSet = pd.concat((trainingDataSet, testingDataSet), ignore_index=True)
        print("completeDataSet.shape: ", completeDataSet.shape)
        print("completeDataSet.head: ", completeDataSet.head())

        # Feature Selection
        if featureSelection == 'TheilsU':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingTheilU(completeDataSet)
        elif featureSelection == 'Chi-SquaredTest':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingChisquaredTest(completeDataSet)
        elif featureSelection == 'RandomForestClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingRandomForestClassifier(completeDataSet)
        elif featureSelection == 'ExtraTreesClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingExtraTreesClassifier(completeDataSet)
        else:
            raise ValueError('Unsupported feature selection technique: {!r}'.format(featureSelection))

        # Feature Encoding
        if featureEncoding == 'LabelEncoder':
            completeEncodedDataSet = featureEncodingUsingLabelEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncoding == 'OneHotEncoder':
            completeEncodedDataSet = featureEncodingUsingOneHotEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncoding == 'FrequencyEncoder':
            completeEncodedDataSet = featureEncodingUsingFrequencyEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncoding == 'BinaryEncoder':
            completeEncodedDataSet = featureEncodingUsingBinaryEncoder(completeDataSetAfterFeatuerSelection)
        else:
            raise ValueError('Unsupported feature encoding technique: {!r}'.format(featureEncoding))

        # Feature Scaling
        if featureScaling == 'Min-Max':
            completeEncodedAndScaledDataset = featureScalingUsingMinMaxScaler(completeEncodedDataSet)
        elif featureScaling == 'Binarizing':
            completeEncodedAndScaledDataset = featureScalingUsingBinarizer(completeEncodedDataSet)
        elif featureScaling == 'Normalizing':
            completeEncodedAndScaledDataset = featureScalingUsingNormalizer(completeEncodedDataSet)
        elif featureScaling == 'Standardization':
            completeEncodedAndScaledDataset = featureScalingUsingStandardScalar(completeEncodedDataSet)
        else:
            raise ValueError('Unsupported feature scaling technique: {!r}'.format(featureScaling))

        # The train/test split that used to follow was computed and then
        # discarded, so it is no longer performed here.

    return completeEncodedAndScaledDataset
def getStatisticsOfData(dataSet):
    """Print a descriptive summary of the dataset.

    Reports the shape, the number and names of categorical and numerical
    features (label excluded), whether missing values or duplicate records
    exist, the distinct label values, the distinct values of every non-numeric
    column and the label distribution. When duplicates are reported, the
    remaining statistics are computed on a de-duplicated local copy.

    Args:
        dataSet: DataFrame containing the feature columns and the label column
            named by ``getLabelName()``.

    Returns:
        None. All results are printed.
    """
    print("***** Start checking the statistics of the dataSet *****\n")
    labelName = getLabelName()

    # Number of rows and columns in the dataset
    print("***** Shape (number of rows and columns) in the dataset: ", dataSet.shape)

    # Feature-only view (label removed), shared by the three counts below
    featureFrame = dataSet.drop([labelName], axis=1)
    featureCount = len(featureFrame.columns)
    print("***** Total number of features in the dataset: ", featureCount)

    numericFeatureNames = list(featureFrame._get_numeric_data().columns)
    categoricalFeatureNames = list(
        set(featureFrame.columns) - set(numericFeatureNames))
    print("***** Number of categorical features in the dataset: ",
          len(categoricalFeatureNames))
    print("***** Number of numerical features in the dataset: ",
          len(numericFeatureNames))

    # Names of categorical features in the dataset
    print("\n***** Names of categorical features in dataset *****\n")
    printList(categoricalFeatureNames, 'Categorical features in dataset')

    # Names of numerical features in the dataset
    print("\n***** Names of numerical features in dataset *****\n")
    printList(numericFeatureNames, 'Numerical features in the dataset')

    # Missing values and duplicate records
    hasMissingValues = checkForMissingValues(dataSet)
    print("\n***** Are there any missing values in the data set: ",
          hasMissingValues)
    hasDuplicateRecords = checkForDulicateRecords(dataSet)
    print("\n***** Are there any duplicate records in the data set: ",
          hasDuplicateRecords)

    # Continue with a de-duplicated copy when duplicates were reported
    if hasDuplicateRecords:
        dataSet = dataSet.drop_duplicates()
        print("Number of records in the dataSet after removing the duplicates: ",
              len(dataSet.index))

    # Number of different label values present in the dataset
    print('\n****** Number of different values for label that are present in the dataset: ',
          dataSet[labelName].nunique())

    # The different label values themselves
    print('\n****** Here is the list of unique label types present in the dataset ***** \n')
    uniqueLabels = list(dataSet[getLabelName()].unique())
    printList(uniqueLabels, 'Unique label types in the dataset')

    # Distinct values of every non-numeric column (the label is included here
    # when it is non-numeric, because the full frame is inspected)
    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    nonNumericColumns = list(
        set(dataSet.columns) - set(dataSet._get_numeric_data().columns))
    for columnName in nonNumericColumns:
        sortedDistinct = np.unique(dataSet[columnName])
        print('\n{}: {} '.format(columnName, len(sortedDistinct)))
        printList(dataSet[columnName].unique(), 'distinct values')

    print('\n****** Label distribution in the dataset *****\n')
    print(dataSet[labelName].value_counts())
    print()
    print("\n***** End checking the statistics of the dataSet *****")