コード例 #1
0
def _correlate_matrix(matrix, baits, percentiles):
    '''
    Correlate the rows of an expression matrix with its baits.

    Rows with (near) zero standard deviation are removed first. The remaining
    rows are correlated with the bait rows using `pearson_df`. Correlations
    strictly between the lower and upper cutoff are discarded and the rest is
    returned in relational (long) format.

    Parameters
    ----------
    matrix
        Expression matrix; its ``data`` attribute is used as a
        `pandas.DataFrame` with one row per gene.
    baits
        Row labels of the baits. Baits without a row in the cleaned matrix, or
        whose row contains NaN, are left out.
    percentiles
        Lower and upper percentile, passed on to `_estimate_cutoffs`.

    Returns
    -------
    (pandas.DataFrame, ExpressionMatrixInfo)
        The retained correlations with columns ``gene``, ``bait`` and
        ``correlation``; and an info object built from the matrix, the
        correlation sample, the cutoffs and the unfiltered correlation matrix.

    Raises
    ------
    ValueError
        If no rows remain after removing the rows without variance.
    '''
    matrix_df = matrix.data

    # A row without variance has a NaN correlation with every other row, so
    # those rows have to go. Nothing else is cleaned up here: how to clean an
    # expression matrix is the user's decision, not the algorithm's.
    without_variance = matrix_df.std(axis=1) < np.finfo(float).tiny
    rows_dropped = without_variance.sum()
    if rows_dropped:
        matrix_df = matrix_df[~without_variance]
        warning = join_lines(f'''
            Dropped {rows_dropped} out of {len(matrix_df)+rows_dropped} rows
            from {matrix} due to having (near) 0 standard deviation. These rows
            have a NaN correlation with any other row.
            ''')
        logging.warning(warning)
        if matrix_df.empty:
            raise ValueError(join_lines(f'''
                After dropping rows with tiny standard deviation, {matrix} has
                no rows. Please check the expression matrix for errors or drop
                it from the input.
                '''))

    # Cutoffs are estimated on the matrix as given by the caller
    sample, cutoffs = _estimate_cutoffs(matrix, percentiles)
    cutoffs = tuple(cutoffs)
    lower_cutoff, upper_cutoff = cutoffs

    # Correlate with the baits that are present in this matrix
    bait_rows = matrix_df.reindex(baits).dropna()
    cor_matrix = pearson_df(matrix_df, bait_rows)

    # Keep only correlations at or beyond a cutoff; the others become NaN
    is_extreme = (cor_matrix <= lower_cutoff) | (cor_matrix >= upper_cutoff)
    cors = cor_matrix[is_extreme]

    # Reformat to one (gene, bait, correlation) row per retained correlation.
    # The index name is cleared so that reset_index names its column 'index'.
    cors.index.name = None
    cors = pd.melt(
        cors.reset_index().rename(columns={'index': 'gene'}),
        id_vars=['gene'],
        var_name='bait',
        value_name='correlation',
    ).dropna(subset=['correlation'])

    return cors, ExpressionMatrixInfo(matrix, sample, cutoffs, cor_matrix)
コード例 #2
0
 def __init__(self, rgb):
     '''
     Create a colour from integer component values.

     Parameters
     ----------
     rgb : array-like of int
         Colour component values, each in the range [0, 255]. Any integer
         dtype is accepted.

     Raises
     ------
     ValueError
         If the values are not of an integer type or if any value lies
         outside of [0, 255].
     '''
     self._rgb = np.array(rgb)

     # Check the type before the range: ordering comparisons on non-numeric
     # input (e.g. strings) may raise a TypeError, hiding the message below.
     # np.issubdtype accepts every integer dtype, whereas `dtype != int` only
     # accepts the platform default integer and rejects e.g. uint8 or int32.
     if not np.issubdtype(self._rgb.dtype, np.integer):
         raise ValueError(
             join_lines(f'''
             Colour component value(s) must be int. Given values have type
             {self._rgb.dtype}
             '''))

     # Always store the default integer dtype, whichever integer dtype was
     # given, so that narrow dtypes cannot overflow in later arithmetic.
     self._rgb = self._rgb.astype(int)
     if ((self._rgb < 0) | (self._rgb > 255)).any():
         raise ValueError(
             f'Invalid colour component value(s). Given rgb: {self._rgb}')
コード例 #3
0
def _estimate_cutoffs(matrix, percentiles):
    '''
    Estimate the lower and upper correlation cutoff of an expression matrix.

    The cutoffs are the given percentiles of the pairwise correlations between
    the rows of a sample of ``matrix``. A sample of at most 800 rows is used
    because correlating all rows with each other is n**2. The estimate is
    therefore less accurate for large matrices than for small ones. That is
    acceptable, as in practice users tune the percentiles until the result
    suits them.

    Parameters
    ----------
    matrix
        Expression matrix; its ``data`` attribute is used as a
        `pandas.DataFrame`.
    percentiles
        Percentiles to take of the sampled correlations, passed on to
        `numpy.percentile`.

    Returns
    -------
    (cors, cutoffs)
        ``cors`` is the result of ``pearson_df(sample, sample)``, i.e. the
        correlations within the sample. ``cutoffs`` are the percentiles of
        its non-NaN values above the diagonal.
    '''
    sample_size = 800
    matrix_df = matrix.data

    # Only matrices with more rows than the sample size are actually sampled
    if len(matrix_df) > sample_size:
        row_numbers = np.random.choice(
            len(matrix_df), sample_size, replace=False
        )
        sample = matrix_df.iloc[row_numbers]
    else:
        sample = matrix_df

    # Sorting only serves to make later output prettier
    sample = sample.sort_index()
    cors = pearson_df(sample, sample)

    # pearson(x, y) == pearson(y, x) and pearson(x, x) == 1, so the values
    # strictly above the diagonal hold all the information. Take those as a
    # 1D array.
    triu = cors.values[np.triu_indices(len(cors), 1)]

    # Warn when more than 10% of the correlations is NaN
    is_nan = np.isnan(triu)
    nan_count = is_nan.sum()
    if nan_count > triu.size * .1:
        warning = join_lines(f'''
            Correlation sample of {matrix} contains more than 10% NaN values,
            specifically {nan_count}/{triu.size} correlations are NaN (only
            including non-diagonal upper triangle correlation matrix values).
            ''')
        logging.warning(warning)

    # NaN values do not take part in the percentile calculation
    cutoffs = np.percentile(triu[~is_nan], percentiles)

    return cors, cutoffs
コード例 #4
0
    def from_float(rgb):
        '''
        Create RGB from float array-like.

        Parameters
        ----------
        rgb : ~pytil.numpy.ArrayLike[float]
            ``(red, green, blue)`` array with values between 0 and 1.

        Returns
        -------
        RGB
            Created from the given values multiplied by 255, rounded and
            converted to int.

        Raises
        ------
        ValueError
            If any value is NaN or lies outside of [0, 1].
        '''
        rgb = np.array(rgb)

        # Phrased as "not all values in range" instead of "any value out of
        # range": every comparison with NaN is False, so the latter would let
        # NaN through, only for it to be cast to a meaningless integer below.
        if not ((rgb >= 0.0) & (rgb <= 1.0)).all():
            raise ValueError(
                join_lines(f'''
                Invalid component value(s), should be float in range of [0, 1].
                Given rgb: {rgb}
                '''))
        return RGB((rgb * 255).round().astype(int))
コード例 #5
0
def _parse_percentiles(args):
    '''
    Get the lower and upper percentile from the arguments and validate them.

    Parameters
    ----------
    args
        Mapping with a ``lower_percentile`` and an ``upper_percentile`` item.

    Returns
    -------
    numpy.ndarray
        ``[lower_percentile, upper_percentile]``

    Raises
    ------
    UserError
        If the lower percentile is less than 0 or the upper percentile is
        greater than 100.
    ValueError
        If the lower percentile is greater than the upper percentile.
    '''
    lower_percentile, upper_percentile = (
        args[key] for key in ('lower_percentile', 'upper_percentile')
    )

    # Both percentiles must lie within [0, 100] ...
    if lower_percentile < 0:
        message = (
            f'Lower percentile must be at least 0. Got: {lower_percentile}'
        )
        raise UserError(message)
    if upper_percentile > 100:
        message = (
            f'Upper percentile must be at most 100. Got: {upper_percentile}'
        )
        raise UserError(message)

    # ... and must not be swapped.
    # NOTE(review): this check raises ValueError while the checks above raise
    # UserError; confirm that the difference is intended.
    if lower_percentile > upper_percentile:
        message = join_lines(
            f'''
            Lower percentile must be less or equal to upper percentile, got:
            {lower_percentile}, {upper_percentile}
            '''
        )
        raise ValueError(message)

    return np.array([lower_percentile, upper_percentile])
コード例 #6
0
def _validate_matrices(baits, matrices):
    '''
    Check that the baits and expression matrices are consistent.

    Parameters
    ----------
    baits
        Sequence of bait gene names.
    matrices
        Sequence of expression matrices. Each matrix has a ``name`` attribute
        and a ``data`` attribute; the latter is used as a `pandas.DataFrame`
        whose index holds the gene names.

    Raises
    ------
    UserError
        If no matrices are given, if matrix names are not unique, if a bait
        does not occur in exactly one matrix, if a matrix contains no bait or
        if a gene occurs in more than one matrix.
    '''
    if not matrices:
        raise UserError(join_lines(
            f'''
            Must provide at least one expression matrix, got:
            {matrices}
            '''
        ))
    names = [matrix.name for matrix in matrices]
    if len(matrices) != len(set(names)):
        raise UserError(
            f'Expression matrices must have unique name, got: {sorted(names)}'
        )

    # Check each bait occurs in exactly one matrix. bait_presence[i, j] tells
    # whether bait j is present in matrix i.
    bait_presence = np.array(
        [
            bait in matrix.data.index
            for matrix, bait in product(matrices, baits)
        ],
        dtype=bool,
    )
    bait_presence = bait_presence.reshape(len(matrices), len(baits))

    # Index by the (unique) matrix names rather than by the matrix objects, so
    # pandas never has to interpret a matrix object as an index label.
    missing_bait_matrix = pd.DataFrame(
        bait_presence,
        index=names,
        columns=baits
    )
    missing_bait_matrix = missing_bait_matrix.loc[
        :, bait_presence.sum(axis=0) != 1
    ]
    if not missing_bait_matrix.empty:
        # np.where instead of DataFrame.applymap, which pandas has deprecated
        presence_table = pd.DataFrame(
            np.where(missing_bait_matrix.values, 'present', 'absent'),
            index=missing_bait_matrix.index,
            columns=missing_bait_matrix.columns,
        )
        presence_table.index.name = 'Matrix name'
        presence_table.columns.name = 'Gene name'

        # Dedent the template before inserting the table. The table spans
        # multiple lines which start at column 0; dedenting after inserting
        # it would find no common indentation and leave the message indented
        # with the header of the table misaligned.
        template = dedent(
            '''\
            Each of the following baits is either missing from all or present in
            multiple expression matrices:

            {table}

            Missing baits are columns with no "present" value, while baits in
            multiple matrices have multiple "present" values in a column.'''
        )
        raise UserError(template.format(table=presence_table.to_string()))

    # and each matrix has at least one bait
    is_baitless = bait_presence.sum(axis=1) == 0
    if is_baitless.any():
        # Select with zip rather than through np.array(matrices), which could
        # try to interpret the matrix objects themselves as array data.
        baitless_matrices = ', '.join(
            str(matrix)
            for matrix, baitless in zip(matrices, is_baitless)
            if baitless
        )
        raise UserError(join_lines(
            f'''
            Some expression matrices have no baits: {baitless_matrices}. Each expression
            matrix must contain at least one bait. Either drop the matrices or
            add some of their genes to the baits list.
            '''
        ))

    # Check the matrices don't overlap (same gene in multiple matrices).
    # A flat comprehension avoids the repeated list copies of sum(lists, []).
    all_genes = pd.Series(
        [gene for matrix in matrices for gene in matrix.data.index]
    )
    # Mention each overlapping gene once, however many matrices contain it
    overlapping_genes = all_genes[all_genes.duplicated()].drop_duplicates()
    if not overlapping_genes.empty:
        raise UserError(join_lines(
            f'''
            The following genes appear in multiple expression matrices:
            {', '.join(map(str, overlapping_genes))}. CoExpNetViz does not support gene
            expression data from different matrices for the same gene. Please
            remove rows from the given matrices such that no gene appears in
            multiple matrices.
            '''
        ))