def setUp(self):
    """Fixture: two small 2-D data sets, their transposed (row-major)
    views, and hand-computed population covariance matrices."""
    self.g1 = [
        [2, 2],
        [2, 5],
        [6, 5],
        [7, 3],
        [4, 7],
        [6, 4],
        [5, 3],
        [4, 6],
        [2, 5],
        [1, 3],
    ]
    self.g2 = [
        [6, 5],
        [7, 4],
        [8, 7],
        [5, 6],
        [5, 4],
    ]
    # Transposes of g1/g2: each column of the original becomes a row,
    # so covariance-by-rows on h* should match covariance-by-cols on g*.
    self.h1 = list(linearalg.new_matrix(self.g1).tr())
    self.h2 = list(linearalg.new_matrix(self.g2).tr())
    # Known (pre-computed) population covariance matrices of g1 and g2.
    self.cov1 = linearalg.new_matrix([[3.89, 0.13],
                                      [0.13, 2.21]])
    self.cov2 = linearalg.new_matrix([[1.36, 0.56],
                                      [0.56, 1.36]])
    self.expected_cov = [self.cov1, self.cov2]
def testMahalanobisDistance2d_MATLAB(self):
    """Check squared Mahalanobis distances against MATLAB's mahal().

    Reference MATLAB session::

        g1 = [ 2 2; 2 5; 6 5; 7 3; 4 7; 6 4; 5 3; 4 6; 2 5; 1 3; ];
        g2 = [ 6 5; 7 4; 8 7; 5 6; 5 4; ];
        mahal(g1, g2)
    """
    g1 = [
        [2, 2], [2, 5], [6, 5], [7, 3], [4, 7],
        [6, 4], [5, 3], [4, 6], [2, 5], [1, 3],
    ]
    g2 = [[6, 5], [7, 4], [8, 7], [5, 6], [5, 4]]
    # Expected values produced by MATLAB's mahal(g1, g2).
    expected = [
        11.9083, 12.0333, 0.0333, 4.9083, 8.0333,
        0.9083, 2.9083, 4.9083, 12.0333, 15.9083,
    ]
    # mahal() uses the *sample* covariance of the reference set g2.
    cov = linearalg.new_matrix(g2).covariance_by_cols(population_variance=False)
    for obs, exp in zip(g1, expected):
        self.assertAlmostEqual(
                exp,
                distance.squared_mahalanobis([obs], g2, cov=cov),
                4)
def squared_mahalanobis_1d(u, v, cov=None, population_variance=False):
    """
    Returns the *squared* Mahalanobis distance between vectors `u` and `v`.

    `u` and `v` are assumed to be single-dimension, i.e., a simple list or
    Vector of values.

    `cov` is the covariance matrix. If not given, the pooled covariance of
    `u` and `v` are used. If the pooled covariance is calculated (i.e.,
    `cov` is not given), then if `population_variance` is False (default),
    the data are treated as samples rather than a population.

    If only relative distances are needed (as they are in most cases), then
    this function should be preferred over `mahalanobis_1d(u, v)` which has
    the added computational expense of taking the square root.

    For more details and examples, see
    `dendropy.mathlib.distance.squared_mahalanobis()`.

        >>> u = [1, 2, 2, 4, 1, 4]
        >>> v = [2, 1, 1, 0, 2, 1]
        >>> print squared_mahalanobis_1d(u, v)
        1.3800154321
    """
    # Promote each flat vector to a single-column matrix, then delegate.
    u_col = linearalg.new_matrix([[val] for val in u])
    v_col = linearalg.new_matrix([[val] for val in v])
    return squared_mahalanobis(
            u_col, v_col,
            cov=cov,
            population_variance=population_variance)
def testCovarianceByRows(self):
    """Row-wise covariance of h1/h2 must equal the precomputed matrices.

    Uses ``self.assertEqual`` rather than a bare ``assert`` so the check
    is not stripped under ``python -O`` and a failure reports which data
    set mismatched.
    """
    for i, rows in enumerate([self.h1, self.h2]):
        m = linearalg.new_matrix(rows)
        s = m.covariance_by_rows(population_variance=True)
        self.assertEqual(
                self.expected_cov[i], s,
                "covariance mismatch for data set %d" % i)
def squared_mahalanobis(u, v, cov=None, population_variance=False):
    """
    Returns the *squared* Mahalanobis distance between matrices `u` and `v`.

    `u` and `v` must be 2-dimensional, and have the same number of columns
    (though they can have different numbers of rows). That is, they must be
    lists of lists, and the lengths of the inner lists must be equal, though
    the lengths of the outer lists can be different. If they are simple
    vectors of values, they should be represented as lists of multiple
    single-element lists, or use the convenience function
    `squared_mahalanobis_1d(u, v)`.

    `cov` is the covariance matrix. If not given, the pooled covariance of
    `u` and `v` is used. If the pooled covariance is calculated (i.e., `cov`
    is not given), then if `population_variance` is False (default), the
    data are treated as samples rather than a population.

    Raises ValueError if `u` and `v` do not have the same number of columns.

    If only relative distances are needed (as they are in most cases), then
    this function should be preferred over `mahalanobis(u, v)` which has the
    added computational expense of taking the square root.

    The following examples calculate the Mahalanobis distances between
    matrices, using the pooled covariances::

        >>> u = [ [2, 3.14, 1.3], [1, 4, 5] ]
        >>> v = [ [4, 1, 1], [5, 3, 2], [1, 3, 4], [1, 4, 4] ]
        >>> print squared_mahalanobis(u, v)
        2.15009570304
        >>> u = [ [2, 5], [5, 7] ]
        >>> v = [ [1, 5], [4, 8] ]
        >>> print squared_mahalanobis(u, v)
        28.8888888889
        >>> # for a single column, you can use `squared_mahalanobis_1d(u, v)` or:
        >>> u = [ [1], [3], [5] ]
        >>> v = [ [2], [3] ]
        >>> print squared_mahalanobis(u, v)
        0.147058823529

    The following extended example shows a different approach. Here, we have
    a known vector of means, and we are interested in calculating the
    distances of different datasets to these means. Instead of using the
    pooled covariances of the two matrices, only the covariance of the
    datasets are used (the means are taken to be the truth)::

        #! /usr/bin/env python

        import random
        from dendropy.mathlib import linearalg
        from dendropy.mathlib import distance

        nrows = 10
        ncols = 4
        v1 = []
        for i in range(nrows):
            v1.append([random.gauss(0, 10) for j in range(ncols)])
        v2 = []
        for i in range(nrows):
            v2.append([random.gauss(10, 10) for j in range(ncols)])
        v3 = []
        for i in range(nrows):
            v3.append([random.gauss(-10, 10) for j in range(ncols)])
        c1 = [ [0] * 4 ]
        c2 = [ [10] * 4 ]
        c3 = [ [-10] * 4 ]
        v1 = linearalg.new_matrix(v1)
        s1 = v1.covariance_by_cols()
        v2 = linearalg.new_matrix(v2)
        s2 = v2.covariance_by_cols()
        v3 = linearalg.new_matrix(v3)
        s3 = v3.covariance_by_cols()
        print
        print "-- v1 --"
        print "d(c1, v1) = {}".format(distance.squared_mahalanobis(c1, v1, cov=s1))
        print "d(c2, v1) = {}".format(distance.squared_mahalanobis(c2, v1, cov=s1))
        print "d(c3, v1) = {}".format(distance.squared_mahalanobis(c3, v1, cov=s1))
        print
        print "-- v2 --"
        print "d(c1, v2) = {}".format(distance.squared_mahalanobis(c1, v2, cov=s2))
        print "d(c2, v2) = {}".format(distance.squared_mahalanobis(c2, v2, cov=s2))
        print "d(c3, v2) = {}".format(distance.squared_mahalanobis(c3, v2, cov=s2))
        print
        print "-- v3 --"
        print "d(c1, v3) = {}".format(distance.squared_mahalanobis(c1, v3, cov=s3))
        print "d(c2, v3) = {}".format(distance.squared_mahalanobis(c2, v3, cov=s3))
        print "d(c3, v3) = {}".format(distance.squared_mahalanobis(c3, v3, cov=s3))

    This results in::

        -- v1 --
        d(c1, v1) = 0.170092378552
        d(c2, v1) = 8.59447583779
        d(c3, v1) = 8.898973355
        -- v2 --
        d(c1, v2) = 30.7463150693
        d(c2, v2) = 0.434906936737
        d(c3, v2) = 121.212523478
        -- v3 --
        d(c1, v3) = 4.19154157624
        d(c2, v3) = 18.4689462227
        d(c3, v3) = 0.128840873046

    The `mahal` function of MATLAB calculates the Mahalanobis distance as
    well. Its implementation and usage are a little different:

        d = mahal(Y,X) computes the Mahalanobis distance (in squared units)
        of each observation in Y from the reference sample in matrix X. If Y
        is n-by-m, where n is the number of observations and m is the
        dimension of the data, d is n-by-1. X and Y must have the same
        number of columns, but can have different numbers of rows. X must
        have more rows than columns. For observation I, the Mahalanobis
        distance is defined by d(I) = (Y(I,:)-mu)*inv(SIGMA)*(Y(I,:)-mu)',
        where mu and SIGMA are the sample mean and covariance of the data in
        X.

    Thus, the following MATLAB code::

        >> g1 = [ 2 2; 2 5; 6 5; 7 3; 4 7; 6 4; 5 3; 4 6; 2 5; 1 3; ];
        >> g2 = [ 6 5; 7 4; 8 7; 5 6; 5 4; ];
        >> mahal(g1, g2)

    can be replicated by::

        #! /usr/bin/env python

        from dendropy.mathlib import distance
        from dendropy.mathlib import linearalg

        g1 = [ [2, 2], [2, 5], [6, 5], [7, 3], [4, 7], [6, 4], [5, 3], [4, 6], [2, 5], [1, 3], ]
        g2 = [ [6, 5], [7, 4], [8, 7], [5, 6], [5, 4], ]
        s = linearalg.new_matrix(g2).covariance_by_cols(population_variance=False)
        for g in g1:
            print distance.squared_mahalanobis([g], g2, cov=s)

    """
    if not isinstance(u, linearalg.Matrix):
        u = linearalg.new_matrix(u)
    if not isinstance(v, linearalg.Matrix):
        v = linearalg.new_matrix(v)
    # ValueError rather than assert: the check must survive `python -O`.
    if len(u[0]) != len(v[0]):
        raise ValueError(
                "'u' and 'v' must have the same number of columns")
    if cov is None:
        cov = linearalg.pooled_covariance(
                u, v,
                population_variance=population_variance)
    if len(cov) == 1:
        # 1x1 covariance: invert the scalar directly. (Previously the
        # covariance itself was used here, which *multiplied* the squared
        # mean difference by the variance instead of dividing by it.)
        cov_inv = linearalg.new_matrix([[1.0 / cov[0][0]]])
    else:
        cov_inv = cov.inverse()
    # TODO: column means have already been calculated to
    # get the covariances: avoid duplicating calcs
    mean_diffs = linearalg.new_matrix([u.col_means() - v.col_means()])
    # `mean_diffs` is now a row vector
    # for consistency with the standard formulation, we should:
    #
    #     mean_diffs = mean_diffs.tr()
    #     d = mean_diffs.tr().mmul(cov_inv).mmul(mean_diffs)
    #
    # Instead ...
    d = mean_diffs.mmul(cov_inv).mmul(mean_diffs.tr())
    return d[0][0]
def squared_mahalanobis(u, v, cov=None, population_variance=False):
    """
    Returns the *squared* Mahalanobis distance between matrices `u` and `v`.

    `u` and `v` must be 2-dimensional, and have the same number of columns
    (though they can have different numbers of rows). That is, they must be
    lists of lists, and the lengths of the inner lists must be equal, though
    the lengths of the outer lists can be different. If they are simple
    vectors of values, they should be represented as lists of multiple
    single-element lists, or use the convenience function
    `squared_mahalanobis_1d(u, v)`.

    `cov` is the covariance matrix. If not given, the pooled covariance of
    `u` and `v` is used. If the pooled covariance is calculated (i.e., `cov`
    is not given), then if `population_variance` is False (default), the
    data are treated as samples rather than a population.

    Raises ValueError if `u` and `v` do not have the same number of columns.

    If only relative distances are needed (as they are in most cases), then
    this function should be preferred over `mahalanobis(u, v)` which has the
    added computational expense of taking the square root.

    The following examples calculate the Mahalanobis distances between
    matrices, using the pooled covariances::

        >>> u = [ [2, 3.14, 1.3], [1, 4, 5] ]
        >>> v = [ [4, 1, 1], [5, 3, 2], [1, 3, 4], [1, 4, 4] ]
        >>> print squared_mahalanobis(u, v)
        2.15009570304
        >>> u = [ [2, 5], [5, 7] ]
        >>> v = [ [1, 5], [4, 8] ]
        >>> print squared_mahalanobis(u, v)
        28.8888888889
        >>> # for a single column, you can use `squared_mahalanobis_1d(u, v)` or:
        >>> u = [ [1], [3], [5] ]
        >>> v = [ [2], [3] ]
        >>> print squared_mahalanobis(u, v)
        0.147058823529

    The following extended example shows a different approach. Here, we have
    a known vector of means, and we are interested in calculating the
    distances of different datasets to these means. Instead of using the
    pooled covariances of the two matrices, only the covariance of the
    datasets are used (the means are taken to be the truth)::

        #! /usr/bin/env python

        import random
        from dendropy.mathlib import linearalg
        from dendropy.mathlib import distance

        nrows = 10
        ncols = 4
        v1 = []
        for i in range(nrows):
            v1.append([random.gauss(0, 10) for j in range(ncols)])
        v2 = []
        for i in range(nrows):
            v2.append([random.gauss(10, 10) for j in range(ncols)])
        v3 = []
        for i in range(nrows):
            v3.append([random.gauss(-10, 10) for j in range(ncols)])
        c1 = [ [0] * 4 ]
        c2 = [ [10] * 4 ]
        c3 = [ [-10] * 4 ]
        v1 = linearalg.new_matrix(v1)
        s1 = v1.covariance_by_cols()
        v2 = linearalg.new_matrix(v2)
        s2 = v2.covariance_by_cols()
        v3 = linearalg.new_matrix(v3)
        s3 = v3.covariance_by_cols()
        print
        print "-- v1 --"
        print "d(c1, v1) = {}".format(distance.squared_mahalanobis(c1, v1, cov=s1))
        print "d(c2, v1) = {}".format(distance.squared_mahalanobis(c2, v1, cov=s1))
        print "d(c3, v1) = {}".format(distance.squared_mahalanobis(c3, v1, cov=s1))
        print
        print "-- v2 --"
        print "d(c1, v2) = {}".format(distance.squared_mahalanobis(c1, v2, cov=s2))
        print "d(c2, v2) = {}".format(distance.squared_mahalanobis(c2, v2, cov=s2))
        print "d(c3, v2) = {}".format(distance.squared_mahalanobis(c3, v2, cov=s2))
        print
        print "-- v3 --"
        print "d(c1, v3) = {}".format(distance.squared_mahalanobis(c1, v3, cov=s3))
        print "d(c2, v3) = {}".format(distance.squared_mahalanobis(c2, v3, cov=s3))
        print "d(c3, v3) = {}".format(distance.squared_mahalanobis(c3, v3, cov=s3))

    This results in::

        -- v1 --
        d(c1, v1) = 0.170092378552
        d(c2, v1) = 8.59447583779
        d(c3, v1) = 8.898973355
        -- v2 --
        d(c1, v2) = 30.7463150693
        d(c2, v2) = 0.434906936737
        d(c3, v2) = 121.212523478
        -- v3 --
        d(c1, v3) = 4.19154157624
        d(c2, v3) = 18.4689462227
        d(c3, v3) = 0.128840873046

    The `mahal` function of MATLAB calculates the Mahalanobis distance as
    well. Its implementation and usage are a little different:

        d = mahal(Y,X) computes the Mahalanobis distance (in squared units)
        of each observation in Y from the reference sample in matrix X. If Y
        is n-by-m, where n is the number of observations and m is the
        dimension of the data, d is n-by-1. X and Y must have the same
        number of columns, but can have different numbers of rows. X must
        have more rows than columns. For observation I, the Mahalanobis
        distance is defined by d(I) = (Y(I,:)-mu)*inv(SIGMA)*(Y(I,:)-mu)',
        where mu and SIGMA are the sample mean and covariance of the data in
        X.

    Thus, the following MATLAB code::

        >> g1 = [ 2 2; 2 5; 6 5; 7 3; 4 7; 6 4; 5 3; 4 6; 2 5; 1 3; ];
        >> g2 = [ 6 5; 7 4; 8 7; 5 6; 5 4; ];
        >> mahal(g1, g2)

    can be replicated by::

        #! /usr/bin/env python

        from dendropy.mathlib import distance
        from dendropy.mathlib import linearalg

        g1 = [ [2, 2], [2, 5], [6, 5], [7, 3], [4, 7], [6, 4], [5, 3], [4, 6], [2, 5], [1, 3], ]
        g2 = [ [6, 5], [7, 4], [8, 7], [5, 6], [5, 4], ]
        s = linearalg.new_matrix(g2).covariance_by_cols(population_variance=False)
        for g in g1:
            print distance.squared_mahalanobis([g], g2, cov=s)

    """
    if not isinstance(u, linearalg.Matrix):
        u = linearalg.new_matrix(u)
    if not isinstance(v, linearalg.Matrix):
        v = linearalg.new_matrix(v)
    # ValueError rather than assert: the check must survive `python -O`.
    if len(u[0]) != len(v[0]):
        raise ValueError(
                "'u' and 'v' must have the same number of columns")
    if cov is None:
        cov = linearalg.pooled_covariance(
                u, v,
                population_variance=population_variance)
    if len(cov) == 1:
        # 1x1 covariance: invert the scalar directly. (Previously the
        # covariance itself was used here, which *multiplied* the squared
        # mean difference by the variance instead of dividing by it.)
        cov_inv = linearalg.new_matrix([[1.0 / cov[0][0]]])
    else:
        cov_inv = cov.inverse()
    # TODO: column means have already been calculated to
    # get the covariances: avoid duplicating calcs
    mean_diffs = linearalg.new_matrix([u.col_means() - v.col_means()])
    # `mean_diffs` is now a row vector
    # for consistency with the standard formulation, we should:
    #
    #     mean_diffs = mean_diffs.tr()
    #     d = mean_diffs.tr().mmul(cov_inv).mmul(mean_diffs)
    #
    # Instead ...
    d = mean_diffs.mmul(cov_inv).mmul(mean_diffs.tr())
    return d[0][0]