def center_scale(row):
    """centers the provided row around the median"""
    filtered = row[np.isfinite(row)]
    center = scipy.median(filtered)
    scale = util.r_stddev(filtered)
    nurow = [((value - center) / scale) if not np.isnan(value) else value
             for value in row]
    return nurow
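# center_scale() relies on util.r_stddev; below is a minimal sketch of such a
# helper, assuming it mirrors R's sd() (sample standard deviation, ddof=1) and
# drops non-finite entries. The project's actual util implementation may
# differ; the name r_stddev_sketch is ours.
import numpy as np

def r_stddev_sketch(values):
    """R-style sample standard deviation: ddof=1, non-finite values dropped
    (assumed behavior of util.r_stddev)"""
    arr = np.asarray(values, dtype=float)
    finite = arr[np.isfinite(arr)]
    return np.std(finite, ddof=1)

# under that assumption, r_stddev_sketch([0.1, 0.2, 0.3, np.nan]) is ~0.1
# (matching the test further below), and center_scale() maps finite entries
# to (value - median) / sd while passing NaNs through unchanged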
def center_scale_filter(matrix, group_columns, group_controls):
    """center the values of each row around the median of its control
    columns and scale by the standard deviation of its group columns.
    This is a specialized version of center_scale()"""
    centers = [scipy.median([matrix[row][col] for col in group_controls])
               for row in range(matrix.num_rows())]
    scale_factors = [util.r_stddev([matrix[row][col] for col in group_columns])
                     for row in range(matrix.num_rows())]
    for row in range(matrix.num_rows()):
        for col in group_columns:
            matrix[row][col] -= centers[row]
            matrix[row][col] /= scale_factors[row]
    return matrix
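# Usage sketch for center_scale_filter(), under stated assumptions: the
# project's matrix type is only assumed to support num_rows() and mutable
# [row][col] indexing, so ListMatrix below is a hypothetical stand-in, and
# scipy.median / util.r_stddev are assumed to resolve as in the module above.
class ListMatrix:
    """hypothetical minimal matrix: row-indexable, mutable, with num_rows()"""
    def __init__(self, rows):
        self.rows = [list(r) for r in rows]

    def num_rows(self):
        return len(self.rows)

    def __getitem__(self, row):
        return self.rows[row]

# row 0: controls [1.0, 3.0] give center 2.0; sd([1, 3, 5]) = 2.0 (R-style),
# so the row becomes [-0.5, 0.5, 1.5]
m = ListMatrix([[1.0, 3.0, 5.0],
                [2.0, 4.0, 6.0]])
center_scale_filter(m, group_columns=[0, 1, 2], group_controls=[0, 1])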
def coeff_var(row_values):
    """computes the coefficient of variation"""
    sigma = util.r_stddev(row_values)
    mu = util.mean(row_values)
    return sigma / mu
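# Worked example, assuming R-style sample sd as above: the row [1.0, 2.0, 3.0]
# has sigma = sd([1, 2, 3]) = 1.0 (ddof=1) and mu = 2.0, so coeff_var()
# should return 0.5. Note the function divides by zero when mu == 0.
assert coeff_var([1.0, 2.0, 3.0]) == 0.5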
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """This is the combining function, taking n result matrices and scalings"""
    quantile_normalize = config_params['quantile_normalize']

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug']
                and (iteration == 1 or
                     (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args']['functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)
        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names, mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warn("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug('SPARSE SCORES - %d attempt 1: pick from sorted values', i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug('SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]
        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)

        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
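# The final loop in combine() is a plain weighted sum of equally shaped score
# matrices; stripped of the pipeline context, that step reduces to the sketch
# below. weighted_sum and the sample arrays are illustrative, not part of the
# module.
def weighted_sum(matrices, scalings):
    """weighted elementwise sum of equally shaped numpy arrays"""
    combined = np.zeros(matrices[0].shape)
    for values, weight in zip(matrices, scalings):
        combined += values * weight
    return combined

row_scores = np.array([[1.0, 2.0], [3.0, 4.0]])
net_scores = np.array([[0.5, 0.5], [0.5, 0.5]])
weighted_sum([row_scores, net_scores], [6.0, 0.5])
# -> [[ 6.25, 12.25],
#     [18.25, 24.25]]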
def test_r_stddev_with_nan(self):
    """tests that the standard deviation function ignores NaN values"""
    self.assertEquals(0.1, util.r_stddev([0.1, 0.2, 0.3, np.nan]))
def combine(result_matrices, score_scalings, membership, quantile_normalize):
    """This is the combining function, taking n result matrices and scalings"""
    for m in result_matrices:
        m.fix_extreme_values()

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.info("quantile normalize in %f s.", elapsed / 1000.0)
        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row]][cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names, mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warn("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.info("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                #print "qqq(%d) = %f" % (i, qqq)
                # note: qqq == 0 is only logged here; the division below
                # still runs and then yields non-finite scores
                if qqq == 0:
                    logging.error("very sparse score !!!")
                values = values / qqq * abs(rs_quant)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]
        elapsed = util.current_millis() - start_time
        logging.info("combined score in %f s.", elapsed / 1000.0)

        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None