def test_quantile_normalize(self):
    """quantile normalization against pre-computed reference matrices"""
    row_scores = read_matrix('testdata/rowscores_fixed.tsv')
    mot_scores = read_matrix('testdata/motscores_fixed.tsv')
    net_scores = read_matrix('testdata/netscores_fixed.tsv')
    references = [read_matrix('testdata/rowscores_qnorm.tsv'),
                  read_matrix('testdata/motscores_qnorm.tsv'),
                  read_matrix('testdata/netscores_qnorm.tsv')]
    # scaling for cluster 49
    scalings = [6.0, 0.033355570380253496, 0.016677785190126748]
    result = dm.quantile_normalize_scores(
        [row_scores, mot_scores, net_scores], scalings)
    for out_matrix, ref_matrix in zip(result, references):
        self.assertTrue(check_matrix_values(out_matrix, ref_matrix))
def test_quantile_normalize_scores_with_undefined_weight(self):
    """one undefined weight"""
    matrix1 = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
    matrix2 = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
    result = dm.quantile_normalize_scores([matrix1, matrix2],
                                          [6.0, np.nan])
    # expected values per output matrix, row-major
    expected = [[[1.0, 3.0], [2.0, 4.0]],
                [[2.0, 4.0], [1.0, 3.0]]]
    for mat_index, expected_values in enumerate(expected):
        actual = result[mat_index].values
        for row in (0, 1):
            for col in (0, 1):
                self.assertAlmostEqual(expected_values[row][col],
                                       actual[row][col])
def test_quantile_normalize_scores_with_no_weights(self):
    """no weights -> fall back to row means"""
    matrix1 = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
    matrix2 = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
    result = dm.quantile_normalize_scores([matrix1, matrix2], None)
    # expected values per output matrix, row-major
    expected = [[[1.55, 2.655], [2.15, 3.25]],
                [[2.15, 3.25], [1.55, 2.655]]]
    for mat_index, expected_values in enumerate(expected):
        actual = result[mat_index].values
        for row in (0, 1):
            for col in (0, 1):
                self.assertAlmostEqual(expected_values[row][col],
                                       actual[row][col])
def test_quantile_normalize_scores_with_all_defined_weights(self):
    """happy path for quantile normalization"""
    matrix1 = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
    matrix2 = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
    result = dm.quantile_normalize_scores([matrix1, matrix2], [6.0, 1.0])
    # expected values per output matrix, row-major
    expected = [[[0.5785714, 1.45071428], [1.02142857, 1.89285714]],
                [[1.02142857, 1.89285714], [0.5785714, 1.45071428]]]
    for mat_index, expected_values in enumerate(expected):
        actual = result[mat_index].values
        for row in (0, 1):
            for col in (0, 1):
                self.assertAlmostEqual(expected_values[row][col],
                                       actual[row][col])
def test_quantile_normalize_scores_with_undefined_weight(self):
    """one undefined weight"""
    matrix1 = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
    matrix2 = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
    result = dm.quantile_normalize_scores([matrix1, matrix2],
                                          [6.0, np.nan])
    # expected values per output matrix, row-major
    # (this revision indexes the result matrices directly)
    expected = [[[1.0, 3.0], [2.0, 4.0]],
                [[2.0, 4.0], [1.0, 3.0]]]
    for mat_index, expected_values in enumerate(expected):
        actual = result[mat_index]
        for row in (0, 1):
            for col in (0, 1):
                self.assertAlmostEqual(expected_values[row][col],
                                       actual[row][col])
def test_quantile_normalize_scores_with_no_weights(self):
    """no weights -> fall back to row means"""
    matrix1 = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
    matrix2 = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
    result = dm.quantile_normalize_scores([matrix1, matrix2], None)
    # expected values per output matrix, row-major
    # (this revision indexes the result matrices directly)
    expected = [[[1.55, 2.655], [2.15, 3.25]],
                [[2.15, 3.25], [1.55, 2.655]]]
    for mat_index, expected_values in enumerate(expected):
        actual = result[mat_index]
        for row in (0, 1):
            for col in (0, 1):
                self.assertAlmostEqual(expected_values[row][col],
                                       actual[row][col])
def test_quantile_normalize_scores_with_all_defined_weights(self):
    """happy path for quantile normalization"""
    matrix1 = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
    matrix2 = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
    result = dm.quantile_normalize_scores([matrix1, matrix2], [6.0, 1.0])
    # expected values per output matrix, row-major
    # (this revision indexes the result matrices directly)
    expected = [[[0.5785714, 1.45071428], [1.02142857, 1.89285714]],
                [[1.02142857, 1.89285714], [0.5785714, 1.45071428]]]
    for mat_index, expected_values in enumerate(expected):
        actual = result[mat_index]
        for row in (0, 1):
            for col in (0, 1):
                self.assertAlmostEqual(expected_values[row][col],
                                       actual[row][col])
def compute(self, iteration_result, ref_matrix=None):
    """compute scores for one iteration

    Runs every registered scoring function, optionally quantile
    normalizes the collected matrices and returns their weighted sum
    (weights come from each function's scaling() for this iteration).
    Returns None when no scoring function produced a matrix.
    """
    result_matrices = []
    score_scalings = []
    reference_matrix = ref_matrix
    iteration = iteration_result['iteration']
    for scoring_function in self.__scoring_functions:
        # This is actually a hack in order to propagate
        # a reference matrix to the compute function
        # This could have negative impact on scalability
        # NOTE: use 'is None' instead of '== None' — a matrix type may
        # overload equality element-wise, making '== None' ambiguous
        if reference_matrix is None and len(result_matrices) > 0:
            reference_matrix = result_matrices[0]

        matrix = scoring_function.compute(iteration_result, reference_matrix)
        if matrix is not None:
            result_matrices.append(matrix)
            score_scalings.append(scoring_function.scaling(iteration))
            if self.__log_subresults:
                self.__log_subresult(scoring_function, matrix)

    if len(result_matrices) > 1:
        # combine the scores of the individual matrices by quantile
        # normalizing them against each other
        result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                       score_scalings)
    if len(result_matrices) > 0:
        combined_score = (result_matrices[0] *
                          self.__scoring_functions[0].scaling(iteration))
        for index in xrange(1, len(result_matrices)):
            combined_score += (
                result_matrices[index] *
                self.__scoring_functions[index].scaling(iteration))
        return combined_score
    else:
        return None
def __combine(self, result_matrices, score_scalings, iteration):
    """Combine the score matrices into a weighted sum, optionally
    quantile normalizing them against each other first.
    Returns None when there are no matrices to combine."""
    # short-circuit keeps the config lookup out of the single-matrix case
    if len(result_matrices) > 1 and self.__config_params['quantile_normalize']:
        norm_start = util.current_millis()
        result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                       score_scalings)
        logging.info("quantile normalize in %f s.",
                     (util.current_millis() - norm_start) / 1000.0)

    if len(result_matrices) == 0:
        return None

    combine_start = util.current_millis()
    combined = result_matrices[0] * self.__scoring_functions[0].scaling(iteration)
    for idx in xrange(1, len(result_matrices)):
        weight = self.__scoring_functions[idx].scaling(iteration)
        combined += result_matrices[idx] * weight
    logging.info("combined score in %f s.",
                 (util.current_millis() - combine_start) / 1000.0)
    return combined
def __combine(self, result_matrices, score_scalings, iteration):
    """Combine the score matrices into a weighted sum on the raw numpy
    values, optionally quantile normalizing them first. Returns a new
    DataMatrix carrying the names of the first input matrix, or None
    when there is nothing to combine."""
    if len(result_matrices) > 1 and self.__config_params['quantile_normalize']:
        norm_start = util.current_millis()
        result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                       score_scalings)
        logging.info("quantile normalize in %f s.",
                     (util.current_millis() - norm_start) / 1000.0)

    if len(result_matrices) == 0:
        return None

    first = result_matrices[0]
    combine_start = util.current_millis()
    # all matrices are assumed to share the same shape/layout
    total = np.zeros(first.values.shape)
    for idx in xrange(len(result_matrices)):
        weight = self.__scoring_functions[idx].scaling(iteration)
        total += result_matrices[idx].values * weight
    logging.info("combined score in %f s.",
                 (util.current_millis() - combine_start) / 1000.0)
    return dm.DataMatrix(first.num_rows, first.num_columns,
                         first.row_names, first.column_names,
                         values=total)
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """This is the combining function, taking n result matrices and scalings.

    Each matrix first has extreme values fixed and its 0.99 quantile
    subtracted. Depending on config_params['quantile_normalize'] the
    matrices are then either quantile normalized against each other, or
    matrix 0 is robust-scaled (median/MAD over cluster members) and the
    remaining matrices rescaled to its 0.01 quantile. Finally the
    matrices are summed with score_scalings as weights.
    Returns a DataMatrix (names taken from matrix 0) or None when
    result_matrices is empty.
    """
    quantile_normalize = config_params['quantile_normalize']

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug'] and
                (iteration == 1 or
                 (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args'][
                'functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(
                result_matrices, score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)
        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        # maps row name -> row index for fast member lookups below
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        # rsm collects, per cluster, the matrix-0 scores of that
        # cluster's member rows; median/MAD of this pool drive the
        # robust scaling
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([
                mat.values[index_map[row], cluster - 1]
                for row in row_members
            ])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # both MAD and stddev were 0: leave matrix 0 unscaled
            logging.warn("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            # rescale the remaining matrices so their 0.01 quantile
            # matches matrix 0's; two fallbacks handle sparse scores
            # whose 0.01 quantile is 0
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 1: pick from sorted values', i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]
        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)

        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
def combine(result_matrices, score_scalings, membership, quantile_normalize):
    """This is the combining function, taking n result matrices and scalings.

    Each matrix first has its extreme values fixed. Depending on
    quantile_normalize the matrices are either quantile normalized
    against each other, or matrix 0 is robust-scaled (median/MAD over
    cluster members) and the remaining matrices rescaled to its 0.01
    quantile. Finally the matrices are summed with score_scalings as
    weights. Returns a DataMatrix (names taken from matrix 0) or None
    when result_matrices is empty.
    """
    for m in result_matrices:
        m.fix_extreme_values()

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.info("quantile normalize in %f s.", elapsed / 1000.0)
        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        # maps row name -> row index for fast member lookups below
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row]][cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names, mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # both MAD and stddev were 0: leave matrix 0 unscaled
            logging.warn("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.info("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    # BUGFIX: previously we divided by qqq even when it
                    # was 0, producing inf/nan scores; skip the rescale
                    # for very sparse score matrices instead
                    logging.error("very sparse score !!!")
                else:
                    values = values / qqq * abs(rs_quant)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]
        elapsed = util.current_millis() - start_time
        logging.info("combined score in %f s.", elapsed / 1000.0)

        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """This is the combining function, taking n result matrices and scalings.

    Each matrix first has extreme values fixed and its 0.99 quantile
    subtracted. Depending on config_params['quantile_normalize'] the
    matrices are then either quantile normalized against each other, or
    matrix 0 is robust-scaled (median/MAD over cluster members) and the
    remaining matrices rescaled to its 0.01 quantile. Finally the
    matrices are summed with score_scalings as weights.
    Returns a DataMatrix (names taken from matrix 0) or None when
    result_matrices is empty.
    """
    quantile_normalize = config_params['quantile_normalize']

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug'] and
                (iteration == 1 or
                 (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args']['functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)
        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        # maps row name -> row index for fast member lookups below
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        # rsm collects, per cluster, the matrix-0 scores of that
        # cluster's member rows; median/MAD of this pool drive the
        # robust scaling
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # both MAD and stddev were 0: leave matrix 0 unscaled
            logging.warn("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            # rescale the remaining matrices so their 0.01 quantile
            # matches matrix 0's; two fallbacks handle sparse scores
            # whose 0.01 quantile is 0
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.warn('SPARSE SCORES - %d attempt 1: pick from sorted values', i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.warn('SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.warn('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]
        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)

        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None