Python EuclideanDistance Exemples, ml_math.euclidean_distance.EuclideanDistance Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : k_nearest_neighbor_regression.py Projet : mxlei01/Simple-ML

    def __init__(self):
        """Create the distance helper used by the KNN computations.

        Instantiates EuclideanDistance and stores it on the object as ``euclidean_distance``.

        """
        distance_helper = EuclideanDistance()
        self.euclidean_distance = distance_helper

Exemple #2

0

Afficher le fichier

Fichier : test_k_nearest_neighbor_regression.py Projet : mxlei01/Simple-ML

    def setUp(self):
        """Prepare the helper objects and housing data frames used by each test.

        Instantiates the project helpers, reads the full, train, test and validation CSV files with pandas
        and stores them on the instance, then casts the 'floors' column of every frame to float and
        afterwards to int.

        """
        # Helper objects exercised by the tests
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.knn = KNearestNeighborRegression()
        self.euclidean_distance = EuclideanDistance()
        self.determine_k_knn = DetermineKKnn()

        # Column name -> type mapping handed to pandas so every column is parsed with the intended type
        column_types = {
            'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
            'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
            'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
            'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
            'view': int,
        }

        def load(csv_path):
            # Every frame is read with the same dtype mapping
            return pd.read_csv(csv_path, dtype=column_types)

        # Full data set (train and test together)
        self.kc_house = load('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_small.csv')

        # Training data only
        self.kc_house_train = load('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_small_train.csv')

        # Test data only
        self.kc_house_test = load('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_small_test.csv')

        # Validation data only
        self.kc_house_valid = load('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_validation.csv')

        # 'floors' is read as str; cast it to float first and then to int in every frame
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test, self.kc_house_valid):
            frame['floors'] = frame['floors'].astype(float).astype(int)

Exemple #3

0

Afficher le fichier

Fichier : k_nearest_neighbor_regression.py Projet : mxlei01/Simple-ML

class KNearestNeighborRegression:

    """Class that uses KNN to compute Regression.

    This class uses K Nearest Neighbor to compute regression by utilizing Euclidean Distance.

    Attributes:
        euclidean_distance (EuclideanDistance): EuclideanDistance class that provide a function to compute euclidean
            distance.

    """

    def __init__(self):
        """Constructor for KNearestNeighborRegression to setup EuclideanDistance.

        Constructor for the KNN class, mainly used to setup EuclideanDistance.

        """
        self.euclidean_distance = EuclideanDistance()

    def k_nearest_neighbor_regression(self, k, feature_matrix_training, feature_vector_query):
        """Computes K Nearest neighbor Regression.

        Computes the distance from the query point to every training point, sorts the distances in ascending
        order and returns the indices of the first k entries.

        Args:
            k (int): Amount of neighbors, must be at least 1.
            feature_matrix_training (numpy.matrix): A matrix of training points.
            feature_vector_query (numpy.array): Query point array.

        Returns:
            numpy.array: Indices of the feature_matrix_training that is closest to feature_vector_query in sorted order.
                At most k indices are returned; fewer when there are fewer than k distances.

        Raises:
            ValueError: If k is smaller than 1.

        """
        # A non-positive k would not fail in the slice below: a negative k silently drops the last |k|
        # neighbours and k == 0 selects nothing, so reject it explicitly.
        if k < 1:
            raise ValueError("k must be a positive integer, got {!r}".format(k))

        # One distance per training point, computed by the euclidean distance helper
        distances = self.euclidean_distance.euclidean_distance_cmp_one_value(feature_matrix_training,
                                                                             feature_vector_query)

        # argsort gives the positions of the distances in ascending order, which are indices into
        # feature_matrix_training; keep the first k of them.
        return np.argsort(distances)[0:k]

    def predict_k_nearest_neighbor_regression(self, k, feature_matrix_training, output_train, feature_vector_query):
        """Predict KNN output by taking average of K points.

        Predicts the output of the k_nearest_neighbor_regression by taking the mean of the result from
        output where we get the indices from nearest knn.

        Args:
            k (int): Amount of neighbors, must be at least 1.
            feature_matrix_training (numpy.matrix): A matrix of training points.
            output_train (numpy.array): Outputs for training data.
            feature_vector_query (numpy.array): Query point array.

        Returns:
            float: Average value of the knn returned indexes.

        Raises:
            ValueError: If k is smaller than 1.

        """
        # Indices of the k nearest training points
        nearest_indices = self.k_nearest_neighbor_regression(k, feature_matrix_training, feature_vector_query)

        # Select the matching outputs and average them over all columns (axis=0)
        return np.mean(output_train[nearest_indices], axis=0)

    def predict_k_nearest_neighbor_all_regression(self, k, feature_matrix_training, output_train,
                                                  feature_matrix_query_set):
        """Predicts KNN output for each query set.

        Predicts the output of multiple k_nearest_neighbor_regression by using the predict_k_nearest_neighbor_regression
        function. Each row of the feature_matrix_query_set is computed for mean.

        Args:
            k (int): Amount of neighbors, must be at least 1.
            feature_matrix_training (numpy.matrix): A matrix of training points.
            output_train (numpy.array): Outputs for training data.
            feature_matrix_query_set (list of float): A list of query points.

        Returns:
            k_nn_predict_multiple (list): List of average value of the output using k_nn_indices that corresponds to
                each query point.

        Raises:
            ValueError: If k is smaller than 1 and the query set is not empty.

        """
        # Each element of feature_matrix_query_set is one query point; predict it and collect the results
        return [self.predict_k_nearest_neighbor_regression(k, feature_matrix_training, output_train, vector_query)
                for vector_query in feature_matrix_query_set]

Exemple #4

0

Afficher le fichier

Fichier : test_k_nearest_neighbor_regression.py Projet : mxlei01/Simple-ML

class TestKNearestNeighborRegression(unittest.TestCase):

    """Tests for KNearestNeighborRegression.

    Uses housing data to test KNearestNeighborRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.
        _FEATURE_COLUMNS (tuple of str): Housing columns used as features by every test.
        _OUTPUT_COLUMNS (tuple of str): Housing column used as the output by every test.

    """

    _multiprocess_can_split_ = True

    # Shared column selections; previously repeated verbatim inside every test method
    _FEATURE_COLUMNS = ('bedrooms',
                        'bathrooms',
                        'sqft_living',
                        'sqft_lot',
                        'floors',
                        'waterfront',
                        'view',
                        'condition',
                        'grade',
                        'sqft_above',
                        'sqft_basement',
                        'yr_built',
                        'yr_renovated',
                        'lat',
                        'long',
                        'sqft_living15',
                        'sqft_lot15')
    _OUTPUT_COLUMNS = ('price',)

    def setUp(self):
        """Constructor for TestKNearestNeighborRegression.

        Creates the helper objects, loads the housing data frames (full, train, test and validation) and
        casts the 'floors' column of every frame to float and then to int.

        """
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.knn = KNearestNeighborRegression()
        self.euclidean_distance = EuclideanDistance()
        self.determine_k_knn = DetermineKKnn()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                      'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                      'view': int}

        # Create a kc_house that encompasses all test and train data
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_small.csv',
                                    dtype=dtype_dict)

        # Create a kc_house_train that encompasses only train data
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/'
                                          'kc_house_data_small_train.csv',
                                          dtype=dtype_dict)

        # Create a kc_house_test that encompasses only test data
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_small_test.csv',
                                         dtype=dtype_dict)

        # Create a kc_house_valid that encompasses only validation data
        self.kc_house_valid = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_validation.csv',
                                          dtype=dtype_dict)

        # 'floors' is read as str; convert it to float first and then to int in every frame
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test, self.kc_house_valid):
            frame['floors'] = frame['floors'].astype(float).astype(int)

    def _prepare_features(self, other_frame):
        """Convert the training frame and another frame to numpy and normalize both.

        The training features are normalized with l2_norm, and the features of other_frame are divided by the
        norms returned for the training features.

        Args:
            other_frame (pandas.DataFrame): Frame (test or validation data) to convert next to the training frame.

        Returns:
            tuple: (features_train, output_train, features_other, output_other).

        """
        # Fresh list objects per call, matching the per-test lists the tests used to build themselves
        feature_list = list(self._FEATURE_COLUMNS)
        output = list(self._OUTPUT_COLUMNS)

        # Extract features and output for the training frame and the other frame
        features_train, output_train = self.convert_numpy.convert_to_numpy(self.kc_house_train, feature_list, output, 1)
        features_other, output_other = self.convert_numpy.convert_to_numpy(other_frame, feature_list, output, 1)

        # Normalize our training features, and then normalize the other set with the same norms
        features_train, norms = self.normalize_features.l2_norm(features_train)
        features_other = features_other / norms

        return features_train, output_train, features_other, output_other

    @staticmethod
    def _first_minimum(values):
        """Return the smallest value of an iterable together with its index.

        The search starts from sys.maxsize, so only values below sys.maxsize are considered, and the first
        occurrence wins on ties. For an empty iterable (sys.maxsize, 0) is returned.

        Args:
            values (iterable): Values to scan.

        Returns:
            tuple: (lowest value, index of the lowest value).

        """
        lowest = sys.maxsize
        lowest_index = 0
        for index, val in enumerate(values):
            if val < lowest:
                lowest = val
                lowest_index = index
        return lowest, lowest_index

    def test_01_compute_euclidean_distance(self):
        """Tests Euclidean distance.

        Tests Euclidean distance and compare it with known values.

        """
        # Normalized train and test features
        features_train, _, features_test, _ = self._prepare_features(self.kc_house_test)

        # Compute the euclidean distance
        distance = self.euclidean_distance.euclidean_distance(features_test[0], features_train[9])

        # Assert that both are equal
        self.assertEqual(round(distance, 3), round(0.059723593716661257, 3))

    def test_02_compute_euclidean_distance_query_point(self):
        """Tests Euclidean distance with a set of query points.

        Test to compute euclidean distance from a query point to multiple points in the training set

        """
        # Normalized train and test features, plus the training output
        features_train, output_train, features_test, _ = self._prepare_features(self.kc_house_test)

        # Determine the smallest euclidean distance we get, and where it occurs
        smallest, smallest_index = self._first_minimum(
            self.euclidean_distance.euclidean_distance_cmp_one_value(features_train, features_test[2]))

        # Assert that we are getting the right prediction (for 1-NN neighbor)
        self.assertEqual(round(smallest, 8), round(0.00286049526751, 8))
        self.assertEqual(output_train[smallest_index], 249000)
        self.assertEqual(smallest_index, 382)

    def test_03_compute_knn(self):
        """Tests knn regression algorithm.

        Tests the knn algorithm and compare it with known values.

        """
        # Normalized train and test features, plus the training output
        features_train, output_train, features_test, _ = self._prepare_features(self.kc_house_test)

        # Assert that the array is the closest with the 3rd house in features_test
        self.assertTrue(np.array_equal(self.knn.k_nearest_neighbor_regression(4, features_train, features_test[2]),
                                       np.array([382, 1149, 4087, 3142])))

        # Assert that the 413987.5 is the correct prediction
        self.assertEqual(self.knn.predict_k_nearest_neighbor_regression(4, features_train,
                                                                        output_train, features_test[2]),
                         413987.5)

        # Compute the lowest predicted value among the first 10 test houses
        lowest_predicted, lowest_predicted_index = self._first_minimum(
            self.knn.predict_k_nearest_neighbor_all_regression(10, features_train, output_train,
                                                               features_test[0:10]))

        # Assert that the few values such as lowest predicted values and index are the one we expect
        self.assertEqual(lowest_predicted, 350032.0)
        self.assertEqual(lowest_predicted_index, 6)

    def test_03_compute_best_k(self):
        """Compute best K for KNN Regression.

        Compute best K using K Fold Cross Validation.

        """
        # Normalized train and validation features, plus both outputs
        features_train, output_train, features_valid, output_valid = self._prepare_features(self.kc_house_valid)

        # Compute the lowest K and lowest K's RSS
        low_rss, low_idx = self.determine_k_knn.determine_k_knn(self.knn.predict_k_nearest_neighbor_all_regression,
                                                                1, 16, {"features_train": features_train,
                                                                        "features_valid": features_valid,
                                                                        "output_train": output_train,
                                                                        "output_valid": output_valid})

        # Assert that the lowest k and rss is correct
        self.assertEqual(round(low_rss, -13), round(6.73616787355e+13, -13))
        self.assertEqual(low_idx, 8)