Esempio n. 1
0
def featureSelectionUsingExtraTreesClassifier(dataSetForFeatureSelection,
                                              numberOfEstimators=700,
                                              randomState=None):
    """Keep the features whose ExtraTrees importance is above the mean importance.

    The dataset is first label encoded with featureEncodingUsingLabelEncoder
    (which, as written in this file, encodes the object columns of the frame
    it receives in place). An ExtraTreesClassifier is then fitted on all the
    non-label columns, and only the columns whose importance is strictly
    greater than the mean importance are kept.

    Parameters:
        dataSetForFeatureSelection: DataFrame containing the features and the
            label column named by getLabelName().
        numberOfEstimators: number of trees in the forest (default 700, the
            value that used to be hard-coded).
        randomState: optional seed forwarded to ExtraTreesClassifier; the
            default None keeps the previous unseeded behaviour.

    Returns:
        A new DataFrame with the selected features in their original column
        order, followed by the label column.
    """
    print(
        "\n****** Start performing feature selection using ExtraTreesClassifier *****"
    )
    print("****** Falls under wrapper methods (feature importance) *****")

    labelName = getLabelName()

    #Applying feature encoding before applying the ExtraTreesClassification
    dataSetForFeatureSelection = featureEncodingUsingLabelEncoder(
        dataSetForFeatureSelection)
    features = dataSetForFeatureSelection.drop([labelName], axis=1)
    label = dataSetForFeatureSelection[labelName]

    #The classifier is trained on an integer encoded copy of the label
    labelTransformed = LabelEncoder().fit_transform(label)

    print("****** ExtraTreesClassification is in progress *****")
    #Train using ExtraTreesClassifier
    trainedforest = ExtraTreesClassifier(n_estimators=numberOfEstimators,
                                         random_state=randomState).fit(
                                             features, labelTransformed)
    importances = trainedforest.feature_importances_  #array with importances of each feature

    #Only keep features whose importance is greater than the mean importance
    isAboveMeanImportance = importances > np.mean(importances)
    featureImportances = pd.Series(importances, index=features.columns)
    #nlargest is only used to print the kept features sorted by importance
    selectedFeatures = featureImportances.nlargest(
        int(isAboveMeanImportance.sum()))
    print("\n selectedFeatures after ExtraTreesClassification: ",
          selectedFeatures)
    print("****** Completed ExtraTreesClassification *****")

    #Select the kept columns directly (original column order), label goes last
    selectedFeaturesNames = list(features.columns[isAboveMeanImportance])
    dataSetAfterFeatuerSelection = dataSetForFeatureSelection[
        selectedFeaturesNames].copy()
    dataSetAfterFeatuerSelection[labelName] = label

    print('\n***** Number of columns in the dataSet after feature selection: ',
          len(dataSetAfterFeatuerSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n',
          dataSetAfterFeatuerSelection.columns)
    print(
        "****** End performing feature selection using ExtraTreesClassifier *****"
    )
    return dataSetAfterFeatuerSelection
def splitCompleteDataSetIntoTrainingSetAndTestingSet(completeDataSet,
                                                     testSize=0.4,
                                                     randomState=42):
    """Split a dataset into training/testing features and labels.

    Parameters:
        completeDataSet: DataFrame containing the features and the label
            column named by getLabelName().
        testSize: value forwarded as test_size to train_test_split
            (default 0.4, the value that used to be hard-coded).
        randomState: value forwarded as random_state to train_test_split
            (default 42, the value that used to be hard-coded).

    Returns:
        A tuple (training features, testing features, training label,
        testing label) exactly as produced by train_test_split.
    """
    labelName = getLabelName()
    label = completeDataSet[labelName]
    features = completeDataSet.drop(labelName, axis=1)

    (featuresInPreProcessedTrainingDataSet,
     featuresInPreProcessedTestingDataSet,
     labelInPreProcessedTrainingDataSet,
     labelInPreProcessedTestingDataSet) = train_test_split(
         features, label, test_size=testSize, random_state=randomState)

    print("features.shape: ", features.shape)
    print("label.shape: ", label.shape)
    return (featuresInPreProcessedTrainingDataSet,
            featuresInPreProcessedTestingDataSet,
            labelInPreProcessedTrainingDataSet,
            labelInPreProcessedTestingDataSet)
def featureEncodingUsingBinaryEncoder(dataSetForFeatureEncoding):
    """Replace every object-typed feature column by its binary encoded columns.

    Each categorical feature is encoded on its own with ce.BinaryEncoder. The
    encoded columns are appended after the remaining columns and the label
    column is moved to the end. The passed DataFrame is not modified.

    The encoded columns are attached by position rather than with an index
    based join: a join duplicates rows whenever the index contains repeated
    values, which is the case for a dataset built with pd.concat without
    ignore_index (as the preprocessing functions of this file do).

    Parameters:
        dataSetForFeatureEncoding: DataFrame containing the features and the
            label column named by getLabelName().

    Returns:
        A new DataFrame with the encoded features followed by the label.

    Raises:
        ValueError: if an encoded column name collides with an existing column.
    """
    print(
        "****** Start binary encoding on the categorical features in the given dataset *****"
    )

    labelName = getLabelName()
    #Get the names of the categorical features, leave the label
    categoricalColumnNames = dataSetForFeatureEncoding.drop(
        [labelName], axis=1).select_dtypes(['object']).columns.values

    print("****** Number of features before binary encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",
          len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",
          categoricalColumnNames)

    print(
        '\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n'
    )
    #Single column frame holding only the label, handed to the encoder as y
    label = dataSetForFeatureEncoding[[labelName]]
    encodedDataSet = dataSetForFeatureEncoding.drop(labelName, axis=1)
    for feature in categoricalColumnNames:
        uniq = np.unique(dataSetForFeatureEncoding[feature])
        print('\n{}: {} '.format(feature, len(uniq)))
        printList(dataSetForFeatureEncoding[feature].unique(),
                  'distinct values')

        binaryEncoder = ce.BinaryEncoder(cols=[feature])
        binaryEncodedFeature = binaryEncoder.fit_transform(
            dataSetForFeatureEncoding[[feature]], label)

        encodedDataSet = encodedDataSet.drop(feature, axis=1)
        #Refuse to silently overwrite an existing column
        clashingColumns = encodedDataSet.columns.intersection(
            binaryEncodedFeature.columns)
        if len(clashingColumns) > 0:
            raise ValueError(
                "Binary encoded columns overlap existing columns: {}".format(
                    list(clashingColumns)))
        #Attach by position so that repeated index values cannot duplicate rows
        for encodedColumnName in binaryEncodedFeature.columns:
            encodedDataSet[encodedColumnName] = binaryEncodedFeature[
                encodedColumnName].values

    #The label goes back as the last column
    encodedDataSet[labelName] = dataSetForFeatureEncoding[labelName].values
    print("****** Number of features after binary encoding: ",
          len(encodedDataSet.columns))

    print(
        "****** End binary encoding on the categorical features in the given dataset *****\n"
    )
    return encodedDataSet
def featureEncodingUsingFrequencyEncoder(dataSetForFeatureEncoding):
    """Replace every object-typed feature column by its relative frequency.

    For each categorical feature a "<feature>_Encoded" column is appended,
    holding for every row the share of rows that carry the same value
    (group size divided by the total number of rows); the original column is
    removed. The label column is moved to the end.

    The work is done on a copy, so the passed DataFrame is left untouched
    (previously the first "<feature>_Encoded" column was also written into
    the caller's frame).

    Parameters:
        dataSetForFeatureEncoding: DataFrame containing the features and the
            label column named by getLabelName().

    Returns:
        A new DataFrame with the encoded features followed by the label.
    """
    print(
        "****** Start frequency encoding on the categorical features in the given dataset *****"
    )

    labelName = getLabelName()
    #Get the names of the categorical features, leave the label
    categoricalColumnNames = dataSetForFeatureEncoding.drop(
        [labelName], axis=1).select_dtypes(['object']).columns.values

    print("****** Number of features before frequency encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",
          len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",
          categoricalColumnNames)

    print(
        '\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n'
    )
    #Work on a copy and set the label aside until all features are encoded
    encodedDataSet = dataSetForFeatureEncoding.copy()
    label = encodedDataSet.pop(labelName)
    for feature in categoricalColumnNames:
        uniq = np.unique(encodedDataSet[feature])
        print('\n{}: {} '.format(feature, len(uniq)))
        printList(encodedDataSet[feature].unique(), 'distinct values')

        #Share of rows per distinct value of the feature
        frequencyEncoder = encodedDataSet.groupby(
            feature).size() / len(encodedDataSet)
        #pop removes the raw column, the encoded one is appended at the end
        encodedDataSet[feature + "_Encoded"] = encodedDataSet.pop(
            feature).map(frequencyEncoder)

    #The label goes back as the last column
    encodedDataSet[labelName] = label
    print("****** Number of features after frequency encoding: ",
          len(encodedDataSet.columns))

    print(
        "****** End frequency encoding on the categorical features in the given dataset *****\n"
    )
    return encodedDataSet
def featureEncodingUsingOneHotEncoder(dataSetForFeatureEncoding):
    """One-hot encode the object-typed feature columns with pd.get_dummies.

    The unique values of every non-numeric column are printed first. The
    object-typed columns other than the label are then expanded into dummy
    columns, prefixed with the name of the column they come from, and the
    label column is moved to the end of the returned frame.

    Parameters:
        dataSetForFeatureEncoding: DataFrame containing the features and the
            label column named by getLabelName().

    Returns:
        The one-hot encoded DataFrame produced by pd.get_dummies, label last.
    """
    print(
        "****** Start one hot encoding on the categorical features in the given dataset *****"
    )

    labelName = getLabelName()
    #Names of the object-typed columns, the label excluded
    featuresOnly = dataSetForFeatureEncoding.drop([labelName], axis=1)
    categoricalColumnNames = featuresOnly.select_dtypes(
        ['object']).columns.values

    print("****** Number of features before one hot encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",
          len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",
          categoricalColumnNames)

    print(
        '\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n'
    )
    #Every column that is not numeric (this may include the label)
    nonNumericColumnNames = list(
        set(dataSetForFeatureEncoding.columns) -
        set(dataSetForFeatureEncoding._get_numeric_data().columns))
    for columnName in nonNumericColumnNames:
        distinctCount = len(np.unique(dataSetForFeatureEncoding[columnName]))
        print('\n{}: {} '.format(columnName, distinctCount))
        printList(dataSetForFeatureEncoding[columnName].unique(),
                  'distinct values')

    #Using get_dummies function to get the dummy variables for the categorical columns
    onHotEncodedDataSet = pd.get_dummies(dataSetForFeatureEncoding,
                                         columns=categoricalColumnNames,
                                         prefix=categoricalColumnNames)

    #Move the label column to the end
    onHotEncodedDataSet[labelName] = onHotEncodedDataSet.pop(labelName)
    print("****** Number of features after one hot encoding: ",
          len(onHotEncodedDataSet.columns))

    print(
        "****** End one hot encoding on the categorical features in the given dataset *****\n"
    )
    return onHotEncodedDataSet
def featureScalingUsingStandardScalar(dataSetForFeatureScaling):
    """Standardize every feature column with StandardScaler.

    The features are selected by dropping the label column by name, so the
    result is correct wherever the label sits in the frame (the previous
    positional slice only worked when the label was the last column). The
    passed DataFrame is no longer modified; previously the label column was
    popped out of it.

    Parameters:
        dataSetForFeatureScaling: DataFrame containing the (already numeric)
            features and the label column named by getLabelName().

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the scaled features
        followed by the untouched label column.
    """
    print(
        "****** Start feature scaling of the features present in the dataset using StandardScalar *****"
    )

    labelName = getLabelName()
    print(dataSetForFeatureScaling.values)

    #Separate the label from the features by name, not by position
    featureFrame = dataSetForFeatureScaling.drop(labelName, axis=1)
    label = dataSetForFeatureScaling[labelName]
    features = featureFrame.values
    print(
        "\n****** Number of features in the dataset before performing scaling: ",
        np.size(features, 1))
    print(
        "\n****** Features in the dataset before performing scaling ***** \n",
        features)

    #Perform feature scaling
    scaler = StandardScaler()
    scaledFeatures = scaler.fit_transform(features)
    print(
        "\n****** Number of features in the dataset after performing scaling: ",
        np.size(scaledFeatures, 1))
    print("\n****** Features in the dataset after performing scaling ***** \n",
          scaledFeatures)

    #Convert from array format to dataframe
    scaledFeatures = pd.DataFrame(scaledFeatures,
                                  columns=featureFrame.columns)
    #The new frame has a 0..n-1 index, so the label must be renumbered to align
    scaledFeatures[labelName] = label.reset_index(drop=True)
    print("scaledFeatures.head(): ", scaledFeatures.head())
    print("scaledFeatures.shape: ", scaledFeatures.shape)

    print(
        "\n****** End of feature scaling of the features present in the dataset using StandardScalar *****\n"
    )
    return scaledFeatures
Esempio n. 7
0
def featureSelectionUsingChisquaredTest(dataSetForFeatureSelection,
                                        numberOfFeatureToBeSelected=10):
    """Keep the features with the best chi-squared score against the label.

    The dataset is first label encoded with featureEncodingUsingLabelEncoder
    (which, as written in this file, encodes the object columns of the frame
    it receives in place). SelectKBest with the chi2 score function then
    picks the best features and every other feature column is dropped.
    Columns are dropped from a new frame instead of being popped out of the
    frame that was passed in.

    Parameters:
        dataSetForFeatureSelection: DataFrame containing the features and the
            label column named by getLabelName().
        numberOfFeatureToBeSelected: how many features to keep (default 10,
            the value that used to be hard-coded). It is capped at the number
            of available features so that small datasets are accepted.

    Returns:
        A new DataFrame with the selected features and the label, in their
        original column order.
    """
    print(
        "\n****** Start performing feature selection using ChisquaredTest *****"
    )
    print("****** Falls under filter methods (univariate selection) *****")

    labelName = getLabelName()

    #To be able to apply Chi-squared test
    dataSetForFeatureSelection = featureEncodingUsingLabelEncoder(
        dataSetForFeatureSelection)

    features = dataSetForFeatureSelection.drop([labelName], axis=1)
    label = dataSetForFeatureSelection[labelName]

    #Never ask SelectKBest for more features than the dataset has
    numberOfFeatureToBeSelected = min(numberOfFeatureToBeSelected,
                                      features.shape[1])

    #Apply SelectKBest class to extract the best features
    bestfeatures = SelectKBest(score_func=chi2, k=numberOfFeatureToBeSelected)
    fitBestfeatures = bestfeatures.fit(features, label)

    #One row per feature with its score, for better visualization
    scoresOfBestFeatures = pd.DataFrame({
        'Features': features.columns,
        'Score': fitBestfeatures.scores_
    })
    print("\n***** Scores for each feature in the dataset are *****")
    print(scoresOfBestFeatures.nlargest(numberOfFeatureToBeSelected, 'Score'))

    #get_support() is True for the selected features; drop all the others
    mask = fitBestfeatures.get_support()
    dataSetAfterFeatuerSelection = dataSetForFeatureSelection.drop(
        features.columns[~mask], axis=1)

    print('***** Number of columns in the dataSet after feature selection: ',
          len(dataSetAfterFeatuerSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n',
          dataSetAfterFeatuerSelection.columns)
    print("****** End performing feature selection using ChisquaredTest *****")

    return dataSetAfterFeatuerSelection
Esempio n. 8
0
def featureSelectionUsingTheilU(dataSetForFeatureSelection,
                                minimumUncertaintyCoefficient=0.50):
    """Keep the columns whose Theil's U against the label reaches a threshold.

    theil_u is evaluated between the label and every column of the dataset
    (the label column itself included). Columns whose coefficient is below
    the threshold are dropped, and the coefficients are shown as a one-row
    heatmap. Columns are dropped from a new frame instead of being popped out
    of the frame that was passed in.

    Parameters:
        dataSetForFeatureSelection: DataFrame containing the features and the
            label column named by getLabelName().
        minimumUncertaintyCoefficient: columns scoring strictly below this
            value are removed (default 0.50, the value that used to be
            hard-coded).

    Returns:
        A new DataFrame with the retained columns in their original order.
    """
    print("\n****** Start performing feature selection using TheilU *****")
    print(
        "****** Falls under the group of techniques that use correlation matrix with Heatmap *****"
    )

    labelName = getLabelName()
    labelValues = dataSetForFeatureSelection[labelName].tolist()
    columns = dataSetForFeatureSelection.columns

    #Uncertainty coefficient between the label and each column
    coefficients = [
        theil_u(labelValues, dataSetForFeatureSelection[column].tolist())
        for column in columns
    ]
    columnsToDrop = [
        column for column, u in zip(columns, coefficients)
        if u < minimumUncertaintyCoefficient
    ]
    dataSetAfterFeatuerSelection = dataSetForFeatureSelection.drop(
        columnsToDrop, axis=1)

    print(
        '***** Ploting the uncertainty coefficient between the target and each feature *****'
    )
    #Built as float right away so that the heatmap receives numeric data
    theilu = pd.DataFrame([coefficients],
                          index=[labelName],
                          columns=columns,
                          dtype=float)
    plt.figure(figsize=(30, 1))
    sns.heatmap(theilu, annot=True, fmt='.2f')
    plt.show()

    print('***** Number of columns in the dataSet after feature selection: ',
          len(dataSetAfterFeatuerSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n',
          dataSetAfterFeatuerSelection.columns)
    print("****** End performing feature selection using TheilU *****")
    return dataSetAfterFeatuerSelection
def featureEncodingUsingLabelEncoder(dataSetForFeatureEncoding):
    """Integer encode every object-typed feature column with LabelEncoder.

    The encoded values are written back into the passed DataFrame, so the
    caller's frame is modified in place; the same object is returned. The
    label column named by getLabelName() is left as it is.

    Parameters:
        dataSetForFeatureEncoding: DataFrame containing the features and the
            label column.

    Returns:
        The passed DataFrame, with its categorical features encoded.
    """
    print(
        "****** Start label encoding on the categorical features in the given dataset *****"
    )

    #Names of the object-typed columns, the label excluded
    featuresOnly = dataSetForFeatureEncoding.drop([getLabelName()], axis=1)
    categoricalColumnNames = featuresOnly.select_dtypes(
        ['object']).columns.values

    print("****** Number of features before label encoding: ",
          len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",
          len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",
          categoricalColumnNames)

    print(
        '\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n'
    )
    #The same encoder instance is re-fitted for every column
    encoder = LabelEncoder()
    for columnName in categoricalColumnNames:
        columnValues = dataSetForFeatureEncoding[columnName]
        print('\n{}: {} '.format(columnName, len(np.unique(columnValues))))
        printList(columnValues.unique(), 'distinct values')
        dataSetForFeatureEncoding[columnName] = encoder.fit_transform(
            columnValues)

    print("****** Number of features after label encoding: ",
          len(dataSetForFeatureEncoding.columns))

    print(
        "****** End label encoding on the categorical features in the given dataset *****\n"
    )
    return dataSetForFeatureEncoding
def performPreprocessingBuildModelsAndEvaluateAccuracy(trainingDataSet, testingDataSet, arrayOfModels):
    """Preprocess the data, train a classifier and store it, for each model description.

    Each processed entry of arrayOfModels is a list whose first four items
    name the feature selection, feature encoding, feature scaling and
    classification technique. For every entry the training and testing CSV
    files are reloaded, combined, preprocessed, split again and classified.
    The training accuracy, testing accuracy, model name and model file name
    are appended to the entry, and the classifier is saved with joblib.

    Parameters:
        trainingDataSet: ignored; the data is reloaded from the path given by
            getPathToTrainingAndTestingDataSets() for every model.
        testingDataSet: ignored, for the same reason.
        arrayOfModels: list of model descriptions, extended in place.

    Raises:
        ValueError: if an entry names an unknown technique. Previously such an
            entry silently reused the result of the preceding model, or failed
            with UnboundLocalError on the first processed entry.
    """
    #NOTE(review): the loop starts at index 1, so arrayOfModels[0] is never built -- presumably a header row; confirm against the caller
    for i in range(1,len(arrayOfModels)):
        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i ,' As Below *************************************************')
        print('\t -- Feature Selection: \t ', arrayOfModels[i][0], ' \n\t -- Feature Encoding: \t ', arrayOfModels[i][1], ' \n\t -- Feature Scaling: \t ', arrayOfModels[i][2], ' \n\t -- Classification: \t ', arrayOfModels[i][3], '\n')

        featureSelectionName = arrayOfModels[i][0]
        featureEncodingName = arrayOfModels[i][1]
        featureScalingName = arrayOfModels[i][2]
        classificationName = arrayOfModels[i][3]

        #Reload the data for every model so that each one starts from the raw files
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        trainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        testingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        #Combining the test and training datasets for preprocessing them together, because we observed that in some datasets
        #the values in the categorical columns of the test dataset and the train dataset are different, which causes issues
        #while applying classification techniques
        completeDataSet = pd.concat(( trainingDataSet, testingDataSet ))

        print("completeDataSet.shape: ",completeDataSet.shape)
        print("completeDataSet.head: ",completeDataSet.head())

        #Feature Selection
        if featureSelectionName == 'TheilsU':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingTheilU(completeDataSet)
        elif featureSelectionName == 'Chi-SquaredTest':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingChisquaredTest(completeDataSet)
        elif featureSelectionName == 'RandomForestClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingRandomForestClassifier(completeDataSet)
        elif featureSelectionName == 'ExtraTreesClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingExtraTreesClassifier(completeDataSet)
        else:
            raise ValueError("Unknown feature selection technique: {!r}".format(featureSelectionName))

        #Feature Encoding
        if featureEncodingName == 'LabelEncoder':
            completeEncodedDataSet = featureEncodingUsingLabelEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'OneHotEncoder':
            completeEncodedDataSet = featureEncodingUsingOneHotEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'FrequencyEncoder':
            completeEncodedDataSet = featureEncodingUsingFrequencyEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'BinaryEncoder':
            completeEncodedDataSet = featureEncodingUsingBinaryEncoder(completeDataSetAfterFeatuerSelection)
        else:
            raise ValueError("Unknown feature encoding technique: {!r}".format(featureEncodingName))

        #Feature Scaling
        if featureScalingName == 'Min-Max':
            completeEncodedAndScaledDataset = featureScalingUsingMinMaxScaler(completeEncodedDataSet)
        elif featureScalingName == 'Binarizing':
            completeEncodedAndScaledDataset = featureScalingUsingBinarizer(completeEncodedDataSet)
        elif featureScalingName == 'Normalizing':
            completeEncodedAndScaledDataset = featureScalingUsingNormalizer(completeEncodedDataSet)
        elif featureScalingName == 'Standardization':
            completeEncodedAndScaledDataset = featureScalingUsingStandardScalar(completeEncodedDataSet)
        else:
            raise ValueError("Unknown feature scaling technique: {!r}".format(featureScalingName))

        #Split the complete dataSet into training dataSet and testing dataSet
        featuresInPreProcessedTrainingDataSet,featuresInPreProcessedTestingDataSet,labelInPreProcessedTrainingDataSet,labelInPreProcessedTestingDataSet = splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)

        #The classifiers take complete frames, so the label is attached to the features again
        trainingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTrainingDataSet, labelInPreProcessedTrainingDataSet], axis=1, sort=False)
        testingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTestingDataSet, labelInPreProcessedTestingDataSet], axis=1, sort=False)

        #Classification
        if classificationName == 'DecisonTree':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingDecisionTreeClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classificationName == 'RandomForestClassifier':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingRandomForestClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classificationName == 'ExtraTreesClassifier':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingExtraTreesClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classificationName == 'LogisticRegressionRegression':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingLogisticRegression(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classificationName == 'LinearDiscriminantAnalysis':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingLinearDiscriminantAnalysis(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classificationName == 'GuassianNaiveBayes':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingGaussianNB(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        elif classificationName == 'KNN':
            classifier, trainingAccuracyScore, testingAccuracyScore = classifyUsingKNNClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)
        else:
            raise ValueError("Unknown classification technique: {!r}".format(classificationName))

        arrayOfModels[i].append(trainingAccuracyScore)
        arrayOfModels[i].append(testingAccuracyScore)

        modelName = featureSelectionName+"_"+featureEncodingName+"_"+featureScalingName+"_"+classificationName
        modelFileName = getPathToGenerateModels() + modelName+".pkl"
        arrayOfModels[i].append(modelName)
        arrayOfModels[i].append(modelFileName)
        #Save the model to file
        joblib.dump(classifier, modelFileName)
def performPreprocessing(trainingDataSet, testingDataSet, arrayOfModels):
    """Run feature selection, encoding and scaling for each model description.

    Each entry of arrayOfModels is a sequence whose first three items name
    the feature selection, feature encoding and feature scaling technique.
    For every entry the training and testing CSV files are reloaded,
    combined and preprocessed.

    Parameters:
        trainingDataSet: ignored; the data is reloaded from the path given by
            getPathToTrainingAndTestingDataSets() for every model.
        testingDataSet: ignored, for the same reason.
        arrayOfModels: sequence of model descriptions.

    Returns:
        The encoded and scaled complete dataset of the LAST entry only.

    Raises:
        ValueError: if arrayOfModels is empty (previously UnboundLocalError
            at the return statement) or if an entry names an unknown
            technique (previously the result of the preceding entry was
            silently reused).
    """
    if len(arrayOfModels) == 0:
        raise ValueError("arrayOfModels must contain at least one model description")

    for i in range(0,len(arrayOfModels)):
        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i ,' As Below *************************************************')
        print('\t -- Feature Selection: \t ', arrayOfModels[i][0], ' \n\t -- Feature Encoding: \t ', arrayOfModels[i][1], ' \n\t -- Feature Scaling: \t ', arrayOfModels[i][2], '\n')

        featureSelectionName = arrayOfModels[i][0]
        featureEncodingName = arrayOfModels[i][1]
        featureScalingName = arrayOfModels[i][2]

        #Reload the data for every model so that each one starts from the raw files
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        trainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        testingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        #Combining the test and training datasets for preprocessing them together, because we observed that in some datasets
        #the values in the categorical columns of the test dataset and the train dataset are different, which causes issues
        #while applying classification techniques
        completeDataSet = pd.concat(( trainingDataSet, testingDataSet ))

        print("completeDataSet.shape: ",completeDataSet.shape)
        print("completeDataSet.head: ",completeDataSet.head())

        #Feature Selection
        if featureSelectionName == 'TheilsU':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingTheilU(completeDataSet)
        elif featureSelectionName == 'Chi-SquaredTest':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingChisquaredTest(completeDataSet)
        elif featureSelectionName == 'RandomForestClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingRandomForestClassifier(completeDataSet)
        elif featureSelectionName == 'ExtraTreesClassifier':
            completeDataSetAfterFeatuerSelection = featureSelectionUsingExtraTreesClassifier(completeDataSet)
        else:
            raise ValueError("Unknown feature selection technique: {!r}".format(featureSelectionName))

        #Feature Encoding
        if featureEncodingName == 'LabelEncoder':
            completeEncodedDataSet = featureEncodingUsingLabelEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'OneHotEncoder':
            completeEncodedDataSet = featureEncodingUsingOneHotEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'FrequencyEncoder':
            completeEncodedDataSet = featureEncodingUsingFrequencyEncoder(completeDataSetAfterFeatuerSelection)
        elif featureEncodingName == 'BinaryEncoder':
            completeEncodedDataSet = featureEncodingUsingBinaryEncoder(completeDataSetAfterFeatuerSelection)
        else:
            raise ValueError("Unknown feature encoding technique: {!r}".format(featureEncodingName))

        #Feature Scaling
        if featureScalingName == 'Min-Max':
            completeEncodedAndScaledDataset = featureScalingUsingMinMaxScaler(completeEncodedDataSet)
        elif featureScalingName == 'Binarizing':
            completeEncodedAndScaledDataset = featureScalingUsingBinarizer(completeEncodedDataSet)
        elif featureScalingName == 'Normalizing':
            completeEncodedAndScaledDataset = featureScalingUsingNormalizer(completeEncodedDataSet)
        elif featureScalingName == 'Standardization':
            completeEncodedAndScaledDataset = featureScalingUsingStandardScalar(completeEncodedDataSet)
        else:
            raise ValueError("Unknown feature scaling technique: {!r}".format(featureScalingName))

        #The split result is not used here; the call is kept because it prints the shapes of the features and of the label
        splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)

    return completeEncodedAndScaledDataset
def getStatisticsOfData (dataSet):
    """Print a summary of the dataset; nothing is returned.

    The report covers the shape, the number and names of the categorical and
    numerical features (label excluded), the results of
    checkForMissingValues and checkForDulicateRecords, the distinct label
    values, the distinct values of every non-numeric column, and the label
    distribution. When duplicates are reported, the remaining statistics are
    computed on a de-duplicated local copy; the caller's frame is unchanged.

    Parameters:
        dataSet: DataFrame containing the features and the label column named
            by getLabelName().
    """
    print("***** Start checking the statistics of the dataSet *****\n")

    labelName = getLabelName()
    #Number of rows and columns in the dataset
    print("***** Shape (number of rows and columns) in the dataset: ", dataSet.shape)

    #All the feature based counts below leave the label column out
    featuresOnly = dataSet.drop([labelName],axis=1)
    numericFeatureColumns = featuresOnly._get_numeric_data().columns

    #Total number of features in the dataset
    print("***** Total number of features in the dataset: ",len(featuresOnly.columns))

    #Total number of categorical featuers in the dataset
    categoricalFeatureNames = list(set(featuresOnly.columns) - set(numericFeatureColumns))
    print("***** Number of categorical features in the dataset: ",len(categoricalFeatureNames))

    #Total number of numerical features in the dataset
    numericalFeatureNames = list(numericFeatureColumns)
    print("***** Number of numerical features in the dataset: ",len(numericalFeatureNames))

    #Names of categorical features in the dataset
    print("\n***** Names of categorical features in dataset *****\n")
    printList(categoricalFeatureNames,'Categorical features in dataset')

    #Names of numerical features in the dataset
    print("\n***** Names of numerical features in dataset *****\n")
    printList(numericalFeatureNames,'Numerical features in the dataset')

    #Checking for any missing values in the data set
    hasMissingValues = checkForMissingValues(dataSet)
    print("\n***** Are there any missing values in the data set: ", hasMissingValues)

    #Check if there are any duplicate records in the data set
    hasDuplicateRecords = checkForDulicateRecords(dataSet)
    print("\n***** Are there any duplicate records in the data set: ", hasDuplicateRecords)
    if hasDuplicateRecords:
        #Only the local name is rebound; the frame of the caller keeps its duplicates
        dataSet = dataSet.drop_duplicates()
        print("Number of records in the dataSet after removing the duplicates: ", len(dataSet.index))

    #How many number of different values for label that are present in the dataset
    print('\n****** Number of different values for label that are present in the dataset: ',dataSet[labelName].nunique())
    #What are the different values for label in the dataset
    print('\n****** Here is the list of unique label types present in the dataset ***** \n')
    printList(list(dataSet[getLabelName()].unique()),'Unique label types in the dataset')

    #What are the different values in each of the non-numeric columns (this time the label is not left out)
    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    nonNumericColumnNames = list(set(dataSet.columns) - set(dataSet._get_numeric_data().columns))
    for columnName in nonNumericColumnNames:
        distinctCount = len(np.unique(dataSet[columnName]))
        print('\n{}: {} '.format(columnName,distinctCount))
        printList(dataSet[columnName].unique(),'distinct values')

    print('\n****** Label distribution in the dataset *****\n')
    print(dataSet[labelName].value_counts())
    print()

    print("\n***** End checking the statistics of the dataSet *****")