Example #1
0
def test_calculate_class_parameters():
    """
    Check the per-class statistics computed by calculate_class_parameters().

    The expected value maps each class label (last column of every dataset
    row) to one (mean, standard deviation, row count) tuple per attribute
    column. Floating point statistics are compared with a tolerance rather
    than with ``==`` so the test does not depend on the exact order of the
    floating point operations inside the classifier.
    """
    from math import isclose

    classifier = NaiveBayesClassifier()

    # Two attribute columns followed by the class label (0 or 1).
    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    expected = {
        0: [(2.7420144012, 0.9265683289298018, 5),
            (3.0054686692, 1.1073295894898725, 5)],
        1: [(7.6146523718, 1.2344321550313704, 5),
            (2.9914679790000003, 1.4541931384601618, 5)]
    }

    actual = classifier.calculate_class_parameters(dataset)

    # Same set of class labels.
    assert set(actual.keys()) == set(expected.keys())

    for label, expected_columns in expected.items():
        actual_columns = actual[label]

        # Same number of attribute columns for this class.
        assert len(actual_columns) == len(expected_columns)

        for actual_stats, expected_stats in zip(actual_columns,
                                                expected_columns):
            mean, stdev, count = actual_stats
            expected_mean, expected_stdev, expected_count = expected_stats

            assert isclose(mean, expected_mean, rel_tol=1e-9)
            assert isclose(stdev, expected_stdev, rel_tol=1e-9)
            # The row count is an integer and must match exactly.
            assert count == expected_count
Example #2
0
class Iris:
    """

    Works on iris.csv dataset and interactively performs the following actions:\n
    1. Classify new data entered by user.\n
    2. Calculate the algorithm implementation accuracy.\n
    3. Show dataset description (iris.names file).\n
    4. Show dataset rows.

    """
    def __init__(self):

        # Paths are relative to the current working directory.
        self.dataset_filename = 'datasets/iris.csv'
        self.description_filename = 'datasets/iris.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)
        # Filled in by data_preprocessing() with the value returned by
        # map_class_names_to_ints(); classify_data() looks up the class name
        # whose mapped value equals the predicted label.
        self.class_map = dict()

    @staticmethod
    def _read_float(prompt):
        """

        Keeps asking the user for a value until it can be parsed as a float.

        Args:
            prompt (str)
                Text shown to the user by input().

        Returns:
            The entered value converted to float.

        """

        while True:
            try:
                return float(input(prompt))
            except ValueError:
                # Only a malformed number is retried. KeyboardInterrupt and
                # EOFError propagate so the caller (see run()) can react
                # instead of being trapped in this loop.
                print(
                    'Incorrect value! Please enter an integer or a floating point number.'
                )

    def data_preprocessing(self):
        """

        Calls seed(1), converts every column except the last one with
        convert_class_values_to_floats() and stores the result of
        map_class_names_to_ints() for the last column in self.class_map.

        Args:
            None.

        Returns:
            Nothing.

        """

        # NOTE(review): seed is imported elsewhere in the file; presumably
        # random.seed, used to make later runs reproducible - confirm.
        seed(1)

        for column in range(len(self.dataset[0]) - 1):
            self.nbc.convert_class_values_to_floats(self.dataset, column)

        self.class_map = self.nbc.map_class_names_to_ints(
            self.dataset,
            len(self.dataset[0]) - 1)

    def classify_data(self):
        """

        Reads four attribute values from the user, asks for confirmation and,
        when confirmed, prints the class name predicted by the classifier.

        Args:
            None.

        Returns:
            Nothing.

        """

        print('\nEnter the data to be classified.\n')

        prompts = (
            'Sepal length [cm]: ',
            'Sepal width [cm]: ',
            'Petal length [cm]: ',
            'Petal width [cm]: ',
        )

        # Insertion order is preserved, so the values end up in prompt order.
        attributes = {prompt: self._read_float(prompt) for prompt in prompts}

        print('\nEntered attributes:\n')

        for prompt, value in attributes.items():
            print(f'{prompt}{value}')

        print()

        confirm_sign = ''

        while confirm_sign not in ('y', 'Y', 'n', 'N'):
            confirm_sign = input('Confirm (y/n): ')

        if confirm_sign in ('n', 'N'):
            return

        model = self.nbc.calculate_class_parameters(self.dataset)
        label = self.nbc.predict(model, list(attributes.values()))

        # Reverse lookup: find the class name mapped to the predicted label.
        for class_name, class_value in self.class_map.items():
            if class_value == label:
                print(f'\nThe entered entity was classified as: {class_name}')
                break

    def calculate_accuracy(self, n_folds=5):
        """

        Calculates algorithm accuracy by using evaluate_algorithm() function.

        Args:
            n_folds (int)
                Number of folds passed to evaluate_algorithm().

        Returns:
            accuracy
                Arithmetic mean of the scores returned by
                evaluate_algorithm(); printed as a percentage.

        """

        # Announce the work before starting it, so the user sees the message
        # while the evaluation is running rather than after it has finished.
        print(
            '\n\nCalculating the accuracy of the classifier using the iris.csv dataset...'
        )
        print('\nResampling: k-fold cross validation split')

        scores = self.nbc.evaluate_algorithm(self.dataset, n_folds)

        accuracy = sum(scores) / float(len(scores))
        print(f'\nAccuracy ({n_folds} folds): {round(accuracy, 3)} %\n')

        return accuracy

    def show_dataset_description(self):
        """

        Prints the 'iris.names' file to the console output.

        Args:
            None.

        Returns:
            Nothing.

        """

        # newline='' is what the csv module asks for when given a file object.
        with open(self.description_filename, 'r', newline='') as f:

            csv_reader = csv.reader(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            # NOTE(review): the file is split on commas by csv.reader and the
            # fields are printed back without a separator, so commas (and
            # quote characters) of the original text do not appear in the
            # output. Kept as-is; confirm whether this is intended.
            for row in csv_reader:
                print(''.join(row))

    def show_dataset_rows(self):
        """

        Prints the 'iris.csv' file to the console output.

        Args:
            None.

        Returns:
            Nothing.

        """

        with open(self.dataset_filename, 'r', newline='') as f:

            csv_reader = csv.reader(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            # join() also copes with the empty list csv.reader yields for a
            # blank line (printed as an empty line) instead of raising
            # IndexError.
            for row in csv_reader:
                print(','.join(row))

    def run(self):
        """

        Creates the interactive menu from which the user can execute the actions handled
        by the other methods in this class.

        Args:
            None.

        Returns:
            Nothing.

        """

        print('\n=================================')
        print('          Iris dataset')
        print('=================================\n')

        self.data_preprocessing()

        actions = {
            '1': self.classify_data,
            '2': self.calculate_accuracy,
            '3': self.show_dataset_description,
            '4': self.show_dataset_rows,
        }

        # The menu is printed on entry and after every executed action, but
        # not again after an invalid choice.
        show_menu = True

        while True:

            if show_menu:
                print('\nChoose the action:')
                print('\n1. Classify new data.')
                print('2. Calculate algorithm accuracy.')
                print('3. Show dataset description.')
                print('4. Show dataset rows.')
                print('5. Go back to the main menu.\n')

            show_menu = False

            choice = input('Choice: ')

            if choice == '5':
                break

            action = actions.get(choice)

            if action is None:
                print('Wrong choice! Please choose option 1-5.')
                continue

            try:
                action()
            except KeyboardInterrupt:
                # Ctrl+C inside an action aborts it and returns to this menu.
                pass

            show_menu = True
class PimaIndiansDiabetes:
    """

    Works on pima-indians-diabetes.csv dataset and interactively performs the following actions:\n
    1. Classify new data entered by user.\n
    2. Calculate the algorithm implementation accuracy.\n
    3. Show dataset description (pima-indians-diabetes.names file).\n
    4. Show dataset rows.

    """
    def __init__(self):

        # Paths are relative to the current working directory.
        self.dataset_filename = 'datasets/pima-indians-diabetes.csv'
        self.description_filename = 'datasets/pima-indians-diabetes.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)

    @staticmethod
    def _read_float(prompt):
        """

        Keeps asking the user for a value until it can be parsed as a float.

        Args:
            prompt (str)
                Text shown to the user by input().

        Returns:
            The entered value converted to float.

        """

        while True:
            try:
                return float(input(prompt))
            except ValueError:
                # Only a malformed number is retried. KeyboardInterrupt and
                # EOFError propagate so the caller (see run()) can react
                # instead of being trapped in this loop.
                print(
                    'Incorrect value! Please enter an integer or a floating point number.'
                )

    def data_preprocessing(self):
        """

        Converts every column except the last one with
        convert_class_values_to_floats() and calls map_class_names_to_ints()
        on the last column with numbers_already=True.

        Args:
            None.

        Returns:
            Nothing.

        """

        for column in range(len(self.dataset[0]) - 1):
            self.nbc.convert_class_values_to_floats(self.dataset, column)

        # The returned mapping is not needed here: classify_data() prints the
        # labels 0 and 1 explicitly as Negative / Positive.
        self.nbc.map_class_names_to_ints(self.dataset,
                                         len(self.dataset[0]) - 1,
                                         numbers_already=True)

    def classify_data(self):
        """

        Reads eight attribute values from the user, asks for confirmation and,
        when confirmed, prints whether the classifier predicted Negative
        (label 0) or Positive (label 1).

        Args:
            None.

        Returns:
            Nothing.

        Raises:
            RuntimeError
                When the predicted label is neither 0 nor 1.

        """

        print('\nEnter the data to be classified.\n')

        prompts = (
            'Number of times pregnant: ',
            'Plasma glucose concentration a 2 hours in an oral glucose tolerance test: ',
            'Diastolic blood pressure (mm Hg): ',
            'Triceps skin fold thickness (mm): ',
            '2-Hour serum insulin (mu U/ml): ',
            'Body mass index (weight in kg/(height in m)^2): ',
            'Diabetes pedigree function: ',
            'Age (years): ',
        )

        # Insertion order is preserved, so the values end up in prompt order.
        attributes = {prompt: self._read_float(prompt) for prompt in prompts}

        print('\nEntered attributes:\n')

        for prompt, value in attributes.items():
            print(f'{prompt}{value}')

        print()

        confirm_sign = ''

        while confirm_sign not in ('y', 'Y', 'n', 'N'):
            confirm_sign = input('Confirm (y/n): ')

        if confirm_sign in ('n', 'N'):
            return

        model = self.nbc.calculate_class_parameters(self.dataset)
        label = self.nbc.predict(model, list(attributes.values()))

        # Original dataset contains class names represented as numbers,
        # so it's needed to print the labels explicitly
        if label == 0:
            print('\nThe entered entity was classified as: Negative')
        elif label == 1:
            print('\nThe entered entity was classified as: Positive')
        else:
            # A bare `raise` here has no active exception and would surface
            # as an uninformative RuntimeError; raise the same type with a
            # message that names the offending label.
            raise RuntimeError(f'Unexpected class label: {label!r}')

    def calculate_accuracy(self, n_folds=5):
        """

        Calculates algorithm accuracy by using evaluate_algorithm() function.

        Args:
            n_folds (int)
                Number of folds passed to evaluate_algorithm().

        Returns:
            accuracy
                Arithmetic mean of the scores returned by
                evaluate_algorithm(); printed as a percentage.

        """

        # Announce the work before starting it, so the user sees the message
        # while the evaluation is running rather than after it has finished.
        print(
            '\n\nCalculating the accuracy of the classifier using the pima-indians-diabetes.csv dataset...'
        )
        print('\nResampling: k-fold cross validation split')

        scores = self.nbc.evaluate_algorithm(self.dataset, n_folds)

        accuracy = sum(scores) / float(len(scores))
        print(f'\nAccuracy ({n_folds} folds): {round(accuracy, 3)} %\n')

        return accuracy

    def show_dataset_description(self):
        """

        Prints the 'pima-indians-diabetes.names' file to the console output.

        Args:
            None.

        Returns:
            Nothing.

        """

        # newline='' is what the csv module asks for when given a file object.
        with open(self.description_filename, 'r', newline='') as f:

            csv_reader = csv.reader(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            # NOTE(review): the file is split on commas by csv.reader and the
            # fields are printed back without a separator, so commas (and
            # quote characters) of the original text do not appear in the
            # output. Kept as-is; confirm whether this is intended.
            for row in csv_reader:
                print(''.join(row))

    def show_dataset_rows(self):
        """

        Prints the 'pima-indians-diabetes.csv' file to the console output.

        Args:
            None.

        Returns:
            Nothing.

        """

        with open(self.dataset_filename, 'r', newline='') as f:

            csv_reader = csv.reader(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            # join() also copes with the empty list csv.reader yields for a
            # blank line (printed as an empty line) instead of raising
            # IndexError.
            for row in csv_reader:
                print(','.join(row))

    def run(self):
        """

        Creates the interactive menu from which the user can execute the actions handled
        by the other methods in this class.

        Args:
            None.

        Returns:
            Nothing.

        """

        # NOTE(review): seed is imported elsewhere in the file; presumably
        # random.seed, used to make later runs reproducible - confirm.
        seed(1)

        print('\n=================================')
        print('   Pima Indians Diabetes dataset')
        print('=================================')

        self.data_preprocessing()

        actions = {
            '1': self.classify_data,
            '2': self.calculate_accuracy,
            '3': self.show_dataset_description,
            '4': self.show_dataset_rows,
        }

        # The menu is printed on entry and after every executed action, but
        # not again after an invalid choice.
        show_menu = True

        while True:

            if show_menu:
                print('\nChoose the action:')
                print('\n1. Classify new data.')
                print('2. Calculate algorithm accuracy.')
                print('3. Show dataset description.')
                print('4. Show dataset rows.')
                print('5. Go back to the main menu.\n')

            show_menu = False

            choice = input('Choice: ')

            if choice == '5':
                break

            action = actions.get(choice)

            if action is None:
                print('Wrong choice! Please choose option 1-5.')
                continue

            try:
                action()
            except KeyboardInterrupt:
                # Ctrl+C inside an action aborts it and returns to this menu.
                pass

            show_menu = True