def _correlate_matrix(matrix, baits, percentiles):
    '''
    Correlate the rows of an expression matrix with the baits present in it.

    Parameters
    ----------
    matrix
        Expression matrix; its ``data`` attribute is used as the data frame of
        rows to correlate.
    baits
        Row labels to correlate against. Baits for which ``reindex`` followed
        by ``dropna`` leaves no row (e.g. labels absent from the matrix) are
        ignored.
    percentiles
        Passed on to `_estimate_cutoffs` to derive the lower and upper
        correlation cutoffs.

    Returns
    -------
    tuple
        ``(correlations, info)``. ``correlations`` is a long-format data frame
        with columns ``gene``, ``bait`` and ``correlation`` which only holds
        correlations ``<=`` the lower or ``>=`` the upper cutoff. ``info`` is
        ``ExpressionMatrixInfo(matrix, sample, cutoffs, cor_matrix)``.

    Raises
    ------
    ValueError
        If dropping the rows with (near) zero standard deviation leaves no
        rows.
    '''
    expression = matrix.data

    # Correlation functions yield NaN for rows without variance, so those rows
    # have to go. Nothing else is cleaned up here: how to clean an expression
    # matrix is the user's choice, not the algorithm's.
    is_flat = expression.std(axis=1) < np.finfo(float).tiny
    rows_dropped = is_flat.sum()
    if rows_dropped:
        total_rows = len(expression)
        expression = expression[~is_flat]
        logging.warning(
            join_lines(f'''
                Dropped {rows_dropped} out of {total_rows} rows from {matrix}
                due to having (near) 0 standard deviation. These rows have a
                NaN correlation with any other row.
                '''))
        if expression.empty:
            raise ValueError(
                join_lines(f'''
                    After dropping rows with tiny standard deviation, {matrix}
                    has no rows. Please check the expression matrix for errors
                    or drop it from the input.
                    '''))

    # Cutoffs are estimated from a sample of the matrix
    sample, cutoffs = _estimate_cutoffs(matrix, percentiles)
    cutoffs = tuple(cutoffs)
    lower_cutoff, upper_cutoff = cutoffs

    # Full correlation matrix of all rows against the baits that are present
    present_baits = expression.reindex(baits).dropna()
    cor_matrix = pearson_df(expression, present_baits)

    # Keep only correlations at or beyond a cutoff, then reshape to a
    # relational (DB) format with one row per (gene, bait) pair
    is_extreme = (cor_matrix <= lower_cutoff) | (cor_matrix >= upper_cutoff)
    extreme = cor_matrix[is_extreme]
    extreme.index.name = None
    relational = (
        extreme
        .reset_index()
        .rename(columns={'index': 'gene'})
        .melt(id_vars=['gene'], var_name='bait', value_name='correlation')
        .dropna(subset=['correlation'])
    )

    info = ExpressionMatrixInfo(matrix, sample, cutoffs, cor_matrix)
    return relational, info
def __init__(self, rgb):
    '''
    Create a colour from integer red, green and blue components.

    Parameters
    ----------
    rgb : array-like of int
        Colour component values, each in the range [0, 255]. Any integer
        dtype is accepted (e.g. ``uint8`` or ``int32``); the values are
        stored with dtype ``int``.

    Raises
    ------
    ValueError
        If any component is less than 0 or greater than 255, or if the
        values do not have an integer dtype.
    '''
    components = np.array(rgb)
    if ((components < 0) | (components > 255)).any():
        raise ValueError(
            f'Invalid colour component value(s). Given rgb: {components}')

    # Comparing the dtype to `int` only accepts the platform default integer
    # and rejects perfectly valid integer input such as uint8 image data, so
    # test for "any integer dtype" instead.
    if not np.issubdtype(components.dtype, np.integer):
        raise ValueError(
            join_lines(f'''
                Colour component value(s) must be int. Given values have type
                {components.dtype}
                '''))

    # Normalise so the stored array always has dtype int, as before
    self._rgb = components.astype(int, copy=False)
def _estimate_cutoffs(matrix, percentiles):
    '''
    Estimate the lower and upper correlation cutoff of an expression matrix.

    The cutoffs are the given percentiles of the pairwise Pearson correlations
    between the rows of a sample of ``matrix``. A sample is used because
    calculating all correlations is n**2. The sample size is fixed to
    something that is cheap enough to calculate, so the estimate is less
    accurate for a large matrix than for a small one.

    A confidence interval could be reported for the estimate, or a
    statistically better cutoff could be derived. In practice the user just
    tweaks the percentiles until the result looks right, so this is good
    enough.

    Parameters
    ----------
    matrix
        Expression matrix; its ``data`` attribute is used as the data frame of
        rows to sample.
    percentiles
        Percentiles handed to `numpy.percentile`; the caller unpacks the
        result as lower cutoff first, upper cutoff second.

    Returns
    -------
    tuple
        ``(cors, cutoffs)``: the correlation matrix of the sample and the
        percentile values of its non-NaN correlations.
    '''
    max_sample_size = 800
    matrix_df = matrix.data

    # Only sample when the matrix is too large to use entirely
    if len(matrix_df) > max_sample_size:
        picked = np.random.choice(
            len(matrix_df), max_sample_size, replace=False
        )
        sample = matrix_df.iloc[picked]
    else:
        sample = matrix_df
    sample = sample.sort_index()  # for prettier output later

    cors = pearson_df(sample, sample)

    # pearson(x, x) == 1 and pearson(x, y) == pearson(y, x), so the strict
    # upper triangle holds every distinct correlation exactly once.
    row_idx, col_idx = np.triu_indices(len(cors), 1)
    pairwise = cors.values[row_idx, col_idx]

    # Warn if >10% NaN
    is_nan = np.isnan(pairwise)
    nan_count = is_nan.sum()
    if nan_count > pairwise.size * .1:
        logging.warning(
            join_lines(f'''
                Correlation sample of {matrix} contains more than 10% NaN
                values, specifically {nan_count}/{pairwise.size} correlations
                are NaN (only including non-diagonal upper triangle
                correlation matrix values).
                '''))

    # NaN values take no part in the percentile calculation
    cutoffs = np.percentile(pairwise[~is_nan], percentiles)
    return cors, cutoffs
def from_float(rgb):
    '''
    Build an `RGB` from float components.

    Parameters
    ----------
    rgb : ~pytil.numpy.ArrayLike[float]
        ``(red, green, blue)`` array whose values lie between 0 and 1.

    Returns
    -------
    RGB
        Colour constructed from the components multiplied by 255, rounded and
        cast to int.

    Raises
    ------
    ValueError
        If any component is less than 0 or greater than 1.
    '''
    components = np.array(rgb)

    out_of_range = np.logical_or(components < 0.0, components > 1.0)
    if out_of_range.any():
        raise ValueError(
            join_lines(f'''
                Invalid component value(s), should be float in range of [0, 1].
                Given rgb: {components}
                '''))

    scaled = (components * 255).round()
    return RGB(scaled.astype(int))
def _parse_percentiles(args):
    '''
    Extract and validate the lower and upper percentile from the arguments.

    Parameters
    ----------
    args : mapping
        Must contain the keys ``'lower_percentile'`` and
        ``'upper_percentile'``.

    Returns
    -------
    numpy.ndarray
        ``[lower_percentile, upper_percentile]``.

    Raises
    ------
    UserError
        If the lower percentile is less than 0, the upper percentile is
        greater than 100, or the lower percentile exceeds the upper one.
    '''
    lower = args['lower_percentile']
    upper = args['upper_percentile']

    if lower < 0:
        raise UserError(
            f'Lower percentile must be at least 0. Got: {lower}'
        )
    if upper > 100:
        raise UserError(
            f'Upper percentile must be at most 100. Got: {upper}'
        )
    if lower > upper:
        # Invalid user input like the two cases above, so report it with the
        # same exception type instead of a ValueError.
        raise UserError(join_lines(
            f'''
            Lower percentile must be less or equal to upper percentile, got:
            {lower}, {upper}
            '''
        ))

    return np.array([lower, upper])
def _validate_matrices(baits, matrices):
    '''
    Validate the expression matrices against each other and the baits.

    Checks, in this order, that

    - at least one matrix is given,
    - the matrix names are unique,
    - each bait is a row label of exactly one matrix,
    - each matrix contains at least one bait,
    - no gene (row label) occurs more than once across the matrices.

    Parameters
    ----------
    baits : sequence
        Bait gene names.
    matrices : sequence
        Expression matrices. Each must have a ``name`` attribute and a
        ``data`` data frame whose index holds the gene names.

    Raises
    ------
    UserError
        If any of the checks fails.
    '''
    # Imported here so the block does not depend on a new file-level import
    from itertools import chain

    if not matrices:
        raise UserError(join_lines(
            f'''
            Must provide at least one expression matrix, got: {matrices}
            '''
        ))

    names = [matrix.name for matrix in matrices]
    if len(matrices) != len(set(names)):
        raise UserError(
            f'Expression matrices must have unique name, got: {sorted(names)}'
        )

    # bait_presence[i, j] is True iff baits[j] is a row label of matrices[i].
    # The reshape keeps the array 2D when there are no baits at all.
    bait_presence = np.array(
        [[bait in matrix.data.index for bait in baits] for matrix in matrices],
        dtype=bool,
    ).reshape(len(matrices), len(baits))

    # Check each bait occurs in exactly one matrix
    is_misplaced = bait_presence.sum(axis=0) != 1
    if is_misplaced.any():
        # np.where instead of DataFrame.applymap, which newer pandas
        # deprecates
        presence_table = pd.DataFrame(
            np.where(bait_presence[:, is_misplaced], 'present', 'absent'),
            index=names,
            columns=[
                bait for bait, misplaced in zip(baits, is_misplaced)
                if misplaced
            ],
        )
        presence_table.index.name = 'Matrix name'
        presence_table.columns.name = 'Gene name'

        # Dedent the template before inserting the table: the table's lines
        # are not indented, so dedenting afterwards finds no common margin
        # and leaves the source indentation in the message.
        message = dedent(
            '''\
            Each of the following baits is either missing from all or present in
            multiple expression matrices:

            {table}

            Missing baits are columns with no "present" value, while baits in
            multiple matrices have multiple "present" values in a column.'''
        ).format(table=presence_table.to_string())
        raise UserError(message)

    # and each matrix has at least one bait
    is_baitless = bait_presence.sum(axis=1) == 0
    if is_baitless.any():
        baitless = ', '.join(
            str(matrix)
            for matrix, has_no_bait in zip(matrices, is_baitless)
            if has_no_bait
        )
        raise UserError(join_lines(
            f'''
            Some expression matrices have no baits: {baitless}. Each expression
            matrix must contain at least one bait. Either drop the matrices or
            add some of their genes to the baits list.
            '''
        ))

    # Check the matrices don't overlap (same gene in multiple matrices).
    # chain avoids the quadratic list concatenation of sum(lists, []).
    all_genes = pd.Series(
        list(chain.from_iterable(matrix.data.index for matrix in matrices)),
        dtype=object,
    )
    # drop_duplicates so a gene occurring 3+ times is only listed once
    overlapping_genes = all_genes[all_genes.duplicated()].drop_duplicates()
    if not overlapping_genes.empty:
        gene_list = ', '.join(map(str, overlapping_genes))
        raise UserError(join_lines(
            f'''
            The following genes appear in multiple expression matrices:
            {gene_list}. CoExpNetViz does not support gene expression data from
            different matrices for the same gene. Please remove rows from the
            given matrices such that no gene appears in multiple matrices.
            '''
        ))