def test_column_means_with_nans(self):
    """Check util.column_means() on a matrix that contains NaN entries.

    Every column of the fixture holds exactly one NaN; the expected
    values below are the means of the remaining (non-NaN) entries of
    each column.
    """
    nan = np.nan
    matrix = [
        [0.0010, 0.1234, 0.21370, nan],
        [0.2123, nan, -0.99980, -0.0213],
        [nan, 0.5546, 0.79123, 0.00312321],
    ]
    expected_means = (0.10664999, 0.33899999, 0.00171, -0.00908839499)

    result = util.column_means(matrix)

    # Compare column by column so a failure reports the first column
    # whose mean is off.
    for column, expected in enumerate(expected_means):
        self.assertAlmostEqual(expected, result[column])
def test_column_means(self):
    """Check util.column_means() on a 3x4 matrix without missing values.

    The expected values are the plain arithmetic means of each column.
    """
    matrix = [
        [0.0010, 0.1234, 0.21370, 0.0342],
        [0.2123, -0.2135, -0.99980, -0.0213],
        [-0.4534, 0.5546, 0.79123, 0.00312321],
    ]
    expected_means = (-0.08003333, 0.15483333, 0.00171, 0.00534107)

    result = util.column_means(matrix)

    # Compare column by column so a failure reports the first column
    # whose mean is off.
    for column, expected in enumerate(expected_means):
        self.assertAlmostEqual(expected, result[column])
def compute_column_scores_submatrix(matrix):
    """Compute dispersion-style column scores for a (sub)matrix.

    Typically used on the sub matrices determined by the pre-seeding,
    i.e. matrix contains only the rows that belong to a certain cluster.

    For every column, the mean squared deviation from the column mean is
    divided by (|column mean| + 0.01). Normalizing diff^2 by the mean
    expression level is similar to the "Index of Dispersion", see
    http://en.wikipedia.org/wiki/Index_of_dispersion for details.

    matrix must provide the attributes `values` and `column_names`.

    Returns None if matrix is None, otherwise a tuple
    (matrix.column_names, scores) with one score per column.
    """
    if matrix is None:
        return None

    values = matrix.values
    means = util.column_means(values)
    squared_deviations = np.square(values - means)

    # The 0.01 offset ensures the divisor is never zero, even for
    # columns whose mean is 0.
    normalizer = np.abs(means) + 0.01
    scores = util.column_means(squared_deviations) / normalizer
    return (matrix.column_names, scores)
def __compute_row_scores_for_submatrix(matrix, submatrix):
    """Compute log-scaled row scores of matrix relative to submatrix.

    The column means are taken from submatrix, which callers are
    expected to derive from the data matrix filtered by the row names
    and column names of a specific cluster. matrix should be filtered
    by the columns of that same cluster so the column means line up
    with its columns.

    Returns the natural logarithm of the per-row mean squared deviation
    from those column means (one value per row of matrix, as produced
    by util.row_means).
    """
    values = matrix.values
    cluster_col_means = util.column_means(submatrix.values)
    squared_deviations = np.square(values - cluster_col_means)
    row_scores = util.row_means(squared_deviations)

    # Clip into [1e-20, 1000.0] so the argument to log stays
    # sufficiently above 0 to avoid errors.
    bounded = np.clip(row_scores, 1e-20, 1000.0)
    return np.log(bounded + 1e-99)
def __compute_row_scores_for_submatrix(matrix, submatrix):
    """Score each row of matrix by its deviation from a cluster profile.

    submatrix supplies the column means and is expected to be the data
    matrix restricted to the row names and column names of a specific
    cluster; matrix should be restricted to the same cluster columns so
    that the means can be applied column-wise.

    The score of a row is log(mean((row - column_means) ** 2)), with
    the mean clipped before the logarithm is taken.
    """
    deviations = matrix.values - util.column_means(submatrix.values)
    mean_squared = util.row_means(np.square(deviations))

    # Keep the log argument within [1e-20, 1000.0]: the lower bound
    # keeps it sufficiently above 0 to avoid errors.
    log_argument = np.clip(mean_squared, 1e-20, 1000.0) + 1e-99
    return np.log(log_argument)
def residual(self, max_row_variance=None):
    """Compute the residual of this matrix.

    The residual is the mean absolute value of
    values + overall_mean - r_outer(row_means, col_means, add),
    where overall_mean is the mean of the row means.
    (util.r_outer presumably builds the row-mean + column-mean table;
    confirm against util.)

    If max_row_variance is given, the residual is divided by the row
    variance of this matrix; a row variance that is NaN or larger than
    max_row_variance is replaced by max_row_variance first.
    """
    row_means = util.row_means(self.values)
    col_means = util.column_means(self.values)
    overall_mean = util.mean(row_means)

    deviations = (self.values + overall_mean
                  - util.r_outer(row_means, col_means, operator.add))
    score = util.mean(np.abs(deviations))

    if max_row_variance is None:
        return score

    row_var = self.row_variance()
    # Cap the divisor at max_row_variance; NaN also falls back to the cap.
    capped = np.isnan(row_var) or row_var > max_row_variance
    divisor = max_row_variance if capped else row_var
    return score / divisor