Ejemplo n.º 1
0
 def test_column_means(self):
     """Check util.column_means() on a 3x4 matrix without missing values.

     Every entry of the result is compared (with assertAlmostEqual) against
     the hand-computed mean of the corresponding column.
     """
     rows = [[0.0010, 0.1234, 0.21370, 0.0342],
             [0.2123, -0.2135, -0.99980, -0.0213],
             [-0.4534, 0.5546, 0.79123, 0.00312321]]
     expected_means = (-0.08003333, 0.15483333, 0.00171, 0.00534107)
     means = util.column_means(rows)
     # index explicitly so a too-short result still raises instead of
     # being silently truncated
     for col, expected in enumerate(expected_means):
         self.assertAlmostEqual(expected, means[col])
Ejemplo n.º 2
0
 def test_column_means_with_nans(self):
     """Check util.column_means() on a 3x4 matrix that contains NaNs.

     The expected values are the means of the non-NaN entries of each
     column, so this test pins down that NaNs are left out of the mean.
     """
     rows = [[0.0010, 0.1234, 0.21370, np.nan],
             [0.2123, np.nan, -0.99980, -0.0213],
             [np.nan, 0.5546, 0.79123, 0.00312321]]
     expected_means = (0.10664999, 0.33899999, 0.00171, -0.00908839499)
     means = util.column_means(rows)
     # index explicitly so a too-short result still raises instead of
     # being silently truncated
     for col, expected in enumerate(expected_means):
         self.assertAlmostEqual(expected, means[col])
Ejemplo n.º 3
0
def compute_column_scores_submatrix(matrix):
    """Compute dispersion-style column scores for a (sub)matrix.

    This is typically applied to the submatrices determined by the
    pre-seeding, i.e. matrix is a submatrix of the input matrix that
    contains only the rows belonging to a certain cluster.

    For every column, the mean squared deviation from the column mean is
    divided by ``abs(column mean) + 0.01``. Normalizing diff^2 by the mean
    expression level is similar to the "Index of Dispersion", see
    http://en.wikipedia.org/wiki/Index_of_dispersion
    for details.

    Returns None if matrix is None, otherwise the tuple
    ``(matrix.column_names, scores)``.
    """
    if matrix is None:
        return None

    values = matrix.values
    means = util.column_means(values)
    squared_deviation = np.square(values - means)
    # the constant offset keeps the divisor strictly positive for
    # columns whose mean is 0
    normalizer = np.abs(means) + 0.01
    scores = util.column_means(squared_deviation) / normalizer
    return (matrix.column_names, scores)
Ejemplo n.º 4
0
def __compute_row_scores_for_submatrix(matrix, submatrix):
    """Compute the row scores of matrix relative to a cluster submatrix.

    submatrix is only used to calculate the column means and should be
    derived from the data matrix filtered by the row names and column
    names of a specific cluster. matrix should be filtered by the columns
    of that same cluster so the column means can be applied properly.

    Returns the natural logarithm of the (clipped) per-row means of the
    squared deviations of matrix.values from the submatrix column means.
    """
    all_values = matrix.values
    cluster_column_means = util.column_means(submatrix.values)
    squared_deviation = np.square(all_values - cluster_column_means)
    row_averages = util.row_means(squared_deviation)
    # clipping ensures the argument to log stays sufficiently above 0
    # to avoid errors
    clipped = np.clip(row_averages, 1e-20, 1000.0)
    return np.log(clipped + 1e-99)
Ejemplo n.º 5
0
def compute_column_scores_submatrix(matrix):
    """Compute dispersion-style column scores for a (sub)matrix.

    This is typically applied to the submatrices determined by the
    pre-seeding, i.e. matrix is a submatrix of the input matrix that
    contains only the rows belonging to a certain cluster.

    For every column, the mean squared deviation from the column mean is
    divided by ``abs(column mean) + 0.01``. Normalizing diff^2 by the mean
    expression level is similar to the "Index of Dispersion", see
    http://en.wikipedia.org/wiki/Index_of_dispersion
    for details.

    Returns None if matrix is None, otherwise the tuple
    ``(matrix.column_names, scores)``.
    """
    if matrix is None:
        return None

    values = matrix.values
    means = util.column_means(values)
    squared_deviation = np.square(values - means)
    # the constant offset keeps the divisor strictly positive for
    # columns whose mean is 0
    normalizer = np.abs(means) + 0.01
    scores = util.column_means(squared_deviation) / normalizer
    return (matrix.column_names, scores)
Ejemplo n.º 6
0
def __compute_row_scores_for_submatrix(matrix, submatrix):
    """Compute the row scores of matrix relative to a cluster submatrix.

    submatrix is only used to calculate the column means and should be
    derived from the data matrix filtered by the row names and column
    names of a specific cluster. matrix should be filtered by the columns
    of that same cluster so the column means can be applied properly.

    Returns the natural logarithm of the (clipped) per-row means of the
    squared deviations of matrix.values from the submatrix column means.
    """
    all_values = matrix.values
    cluster_column_means = util.column_means(submatrix.values)
    squared_deviation = np.square(all_values - cluster_column_means)
    row_averages = util.row_means(squared_deviation)
    # clipping ensures the argument to log stays sufficiently above 0
    # to avoid errors
    clipped = np.clip(row_averages, 1e-20, 1000.0)
    return np.log(clipped + 1e-99)
Ejemplo n.º 7
0
 def residual(self, max_row_variance=None):
     """Compute the residual of this matrix.

     The residual is the mean (via util.mean) of the absolute values of
     ``values + overall_mean - r_outer(row_means, column_means, add)``,
     where overall_mean is the mean of the row means.

     If max_row_variance is given, the residual is divided by the row
     variance; a row variance that is NaN or larger than max_row_variance
     is replaced by max_row_variance first.
     """
     row_avgs = util.row_means(self.values)
     col_avgs = util.column_means(self.values)
     overall_avg = util.mean(row_avgs)

     shifted = self.values + overall_avg
     # presumably the table of row mean + column mean for every cell;
     # verify against util.r_outer
     deviations = shifted - util.r_outer(row_avgs, col_avgs, operator.add)
     mean_abs_deviation = util.mean(np.abs(deviations))

     if max_row_variance is None:
         return mean_abs_deviation

     variance = self.row_variance()
     exceeds_cap = np.isnan(variance) or variance > max_row_variance
     divisor = max_row_variance if exceeds_cap else variance
     return mean_abs_deviation / divisor
Ejemplo n.º 8
0
 def residual(self, max_row_variance=None):
     """Compute the residual of this matrix.

     The residual is the mean (via util.mean) of the absolute values of
     ``values + overall_mean - r_outer(row_means, column_means, add)``,
     where overall_mean is the mean of the row means.

     If max_row_variance is given, the residual is divided by the row
     variance; a row variance that is NaN or larger than max_row_variance
     is replaced by max_row_variance first.
     """
     row_avgs = util.row_means(self.values)
     col_avgs = util.column_means(self.values)
     overall_avg = util.mean(row_avgs)

     shifted = self.values + overall_avg
     # presumably the table of row mean + column mean for every cell;
     # verify against util.r_outer
     deviations = shifted - util.r_outer(row_avgs, col_avgs, operator.add)
     mean_abs_deviation = util.mean(np.abs(deviations))

     if max_row_variance is None:
         return mean_abs_deviation

     variance = self.row_variance()
     exceeds_cap = np.isnan(variance) or variance > max_row_variance
     divisor = max_row_variance if exceeds_cap else variance
     return mean_abs_deviation / divisor
Ejemplo n.º 9
0
 def column_means(self):
     """Return the column means of this matrix's values.

     Delegates to util.column_means(); the result is presumably a numpy
     array with one entry per column - confirm against util.
     """
     column_averages = util.column_means(self.values)
     return column_averages