コード例 #1
def _correlate_matrix(matrix, baits, percentiles):
    """
    Correlate the rows of an expression matrix with the baits it contains.

    Parameters
    ----------
    matrix
        Expression matrix. Only its ``data`` attribute is read here; it is
        used as a data frame with one row per gene (presumably a
        ``pandas.DataFrame`` -- confirm against the caller).
    baits
        Row labels of the bait genes. Labels which do not occur in the matrix
        are dropped, as are baits whose row contains a missing value.
    percentiles
        Lower and upper percentile, passed on to ``_estimate_cutoffs``.

    Returns
    -------
    tuple
        ``(cors, info)``. ``cors`` is a long-format data frame with one row
        per (gene, bait) pair whose correlation is less than or equal to the
        lower cutoff or greater than or equal to the upper cutoff. ``info`` is
        ``ExpressionMatrixInfo(matrix, sample, cutoffs, cor_matrix)`` where
        ``cor_matrix`` is the unfiltered gene-by-bait correlation matrix.

    Raises
    ------
    ValueError
        If no rows are left after dropping the rows with (near) zero standard
        deviation.
    """
    # Function-scope import: the file's import block is not visible from this
    # snippet, so the dependency is brought into scope locally.
    import logging

    matrix_df = matrix.data

    # Remove rows with no variance as correlation functions yield nan for it
    # Note: we only drop the absolutely necessary so that the user can
    # choose how to clean the expression matrices instead of the algorithm
    # doing it for them
    tiny_stds = matrix_df.std(axis=1) < np.finfo(float).tiny
    rows_dropped = int(tiny_stds.sum())
    if rows_dropped:
        matrix_df = matrix_df[~tiny_stds]
        # NOTE(review): the call which emitted this message was lost from the
        # snippet; logging.warning is assumed -- confirm the intended channel.
        logging.warning(join_lines(
            f'''
            Dropped {rows_dropped} out of {len(matrix_df)+rows_dropped} rows
            from {matrix} due to having (near) 0 standard deviation. These rows
            have a NaN correlation with any other row.
            '''
        ))
        if matrix_df.empty:
            raise ValueError(join_lines(
                f'''
                After dropping rows with tiny standard deviation, {matrix} has
                no rows. Please check the expression matrix for errors or drop
                it from the input.
                '''
            ))

    # Get cutoffs
    # NOTE(review): the cutoffs are estimated from `matrix`, i.e. including the
    # rows dropped above; confirm that this is intended.
    sample, cutoffs = _estimate_cutoffs(matrix, percentiles)
    cutoffs = tuple(cutoffs)
    lower_cutoff, upper_cutoff = cutoffs

    # Correlation matrix
    present_baits = matrix_df.reindex(baits).dropna()
    cors = pearson_df(matrix_df, present_baits)
    cor_matrix = cors

    # Cutoff and reformat to relational (DB) format
    cors = cors[(cors <= lower_cutoff) | (cors >= upper_cutoff)]
    cors.index.name = None
    cors = cors.reset_index()
    cors = cors.rename(columns={'index': 'gene'})
    # NOTE(review): the keyword arguments of this call were lost from the
    # snippet. 'gene' and 'correlation' follow from the surrounding lines;
    # 'bait' as the name of the variable column is an assumption -- confirm.
    cors = pd.melt(
        cors, id_vars=['gene'], var_name='bait', value_name='correlation'
    )
    cors = cors.dropna(subset=['correlation'])

    return cors, ExpressionMatrixInfo(matrix, sample, cutoffs, cor_matrix)
コード例 #2
 def __init__(self, rgb):
     """
     Store the colour components after validating them.

     Parameters
     ----------
     rgb
         Array-like of integer colour components, each in the range 0..255.

     Raises
     ------
     ValueError
         If any component is below 0 or above 255, or if the components do
         not have an integer dtype.
     """
     self._rgb = np.array(rgb)
     if ((self._rgb < 0) | (self._rgb > 255)).any():
         raise ValueError(
             f'Invalid colour component value(s). Given rgb: {self._rgb}')
     if self._rgb.dtype != int:
         # NOTE(review): the tail of this message was cut off in the snippet;
         # the dtype is what the visible text announces -- confirm.
         raise ValueError(
             'Colour component value(s) must be int. Given values have type '
             f'{self._rgb.dtype}'
         )
コード例 #3
def _estimate_cutoffs(matrix, percentiles, sample_size=800):
    """
    Estimate upper and lower correlation cutoffs

    Takes the x-th and y-th percentile of a sample similarity matrix of
    `matrix`, returning these as the lower and upper cut-off respectively.

    A sample is used because calculating all correlations is n**2. The
    sample size is chosen to be easy enough to calculate; for a large matrix
    the estimate is less accurate than for a small matrix. In practice the
    user just plays with the percentiles until the result is what they want.

    Parameters
    ----------
    matrix
        Expression matrix. Only its ``data`` attribute is read; it is used as
        a data frame with one row per gene.
    percentiles
        Percentiles to compute, passed to ``numpy.percentile`` as is.
    sample_size : int
        Maximum number of rows to correlate. Matrices with at most this many
        rows are used in full; larger ones are sampled at random without
        replacement.

    Returns
    -------
    tuple
        ``(cors, cutoffs)``: the correlation matrix of the sample with itself
        and the requested percentiles of its non-NaN values above the
        diagonal.
    """
    # Function-scope import: the file's import block is not visible from this
    # snippet, so the dependency is brought into scope locally.
    import logging

    matrix_df = matrix.data

    # Take a sample unless it's a tiny matrix
    if len(matrix_df) <= sample_size:
        sample = matrix_df
    else:
        rows = np.random.choice(len(matrix_df), sample_size, replace=False)
        sample = matrix_df.iloc[rows]

    sample = sample.sort_index()  # for prettier output later
    cors = pearson_df(sample, sample)

    # Get the upper triangle as 1D array, excluding the diagonal.
    # The diagonal is pearson(x, x) == 1, so we ignore that. The matrix is
    # symmetric as pearson(x, y) == pearson(y, x); so we only need to look at
    # the upper triangle.
    triu = cors.values[np.triu_indices(len(cors), 1)]

    # Warn if >10% NaN
    nan_count = np.isnan(triu).sum()
    if nan_count > triu.size * .1:
        # NOTE(review): the call which emitted this message was lost from the
        # snippet; logging.warning is assumed -- confirm the intended channel.
        logging.warning(join_lines(
            f'''
            Correlation sample of {matrix} contains more than 10% NaN values,
            specifically {nan_count}/{triu.size} correlations are NaN (only
            including non-diagonal upper triangle correlation matrix values).
            '''
        ))

    # Ignore NaN values when calculating percentiles
    triu = triu[~np.isnan(triu)]
    cutoffs = np.percentile(triu, percentiles)

    return cors, cutoffs
コード例 #4
    def from_float(rgb):
        """
        Create RGB from float array-like.

        Parameters
        ----------
        rgb : ~pytil.numpy.ArrayLike[float]
            ``(red, green, blue)`` array with values between 0 and 1.

        Returns
        -------
        RGB
            Colour whose components are the given values multiplied by 255
            and rounded to an integer.

        Raises
        ------
        ValueError
            If any component is below 0 or above 1.
        """
        rgb = np.array(rgb)
        if ((rgb < 0.0) | (rgb > 1.0)).any():
            raise ValueError(
                'Invalid component value(s), should be float in range of '
                f'[0, 1]. Given rgb: {rgb}'
            )
        return RGB((rgb * 255).round().astype(int))
コード例 #5
def _parse_percentiles(args):
    """
    Read and validate the lower and upper percentile from parsed arguments.

    Parameters
    ----------
    args
        Mapping with the keys ``'lower_percentile'`` and
        ``'upper_percentile'``.

    Returns
    -------
    numpy.ndarray
        ``[lower_percentile, upper_percentile]``.

    Raises
    ------
    UserError
        If the lower percentile is below 0 or the upper percentile is above
        100.
    ValueError
        If the lower percentile is greater than the upper percentile.
    """
    lower_percentile = args['lower_percentile']
    upper_percentile = args['upper_percentile']
    if lower_percentile < 0:
        raise UserError(
            f'Lower percentile must be at least 0. Got: {lower_percentile}'
        )
    if upper_percentile > 100:
        raise UserError(
            f'Upper percentile must be at most 100. Got: {upper_percentile}'
        )
    if lower_percentile > upper_percentile:
        # NOTE(review): this raises ValueError whereas the checks above raise
        # UserError; kept as is because callers may rely on the type.
        raise ValueError(join_lines(
            f'''
            Lower percentile must be less or equal to upper percentile, got:
            {lower_percentile}, {upper_percentile}
            '''
        ))
    return np.array([lower_percentile, upper_percentile])
コード例 #6
def _validate_matrices(baits, matrices):
    if not matrices:
        raise UserError(join_lines(
            Must provide at least one expression matrix, got:
    names = [matrix.name for matrix in matrices]
    if len(matrices) != len(set(names)):
        raise UserError(
            f'Expression matrices must have unique name, got: {sorted(names)}'

    # Check each bait occurs in exactly one matrix
    bait_presence = np.array([
        bait in matrix.data.index
        for matrix, bait in product(matrices, baits)
    bait_presence = bait_presence.reshape(len(matrices), len(baits))
    missing_bait_matrix = pd.DataFrame(
    missing_bait_matrix = missing_bait_matrix.loc[:,bait_presence.sum(axis=0) != 1]
    # pylint: disable=trailing-whitespace
    if not missing_bait_matrix.empty:
        missing_bait_matrix = missing_bait_matrix.applymap(lambda x: 'present' if x else 'absent')
        missing_bait_matrix.index = missing_bait_matrix.index.map(lambda matrix: matrix.name)
        missing_bait_matrix.index.name = 'Matrix name'
        missing_bait_matrix.columns.name = 'Gene name'
        raise UserError(dedent(
            Each of the following baits is either missing from all or present in
            multiple expression matrices:
            Missing baits are columns with no "present" value, while baits in
            multiple matrices have multiple "present" values in a column.'''

    # and each matrix has at least one bait
    is_baitless = bait_presence.sum(axis=1) == 0
    if is_baitless.any():
        matrices = np.array(matrices)[is_baitless]
        matrices = ', '.join(map(str, matrices))
        raise UserError(join_lines(
            Some expression matrices have no baits: {matrices}. Each expression
            matrix must contain at least one bait. Either drop the matrices or
            add some of their genes to the baits list.

    # Check the matrices don't overlap (same gene in multiple matrices)
    all_genes = pd.Series(sum((list(matrix.data.index) for matrix in matrices), []))
    overlapping_genes = all_genes[all_genes.duplicated()]
    if not overlapping_genes.empty:
        raise UserError(join_lines(
            The following genes appear in multiple expression matrices:
            {', '.join(overlapping_genes)}. CoExpNetViz does not support gene
            expression data from different matrices for the same gene. Please
            remove rows from the given matrices such that no gene appears in
            multiple matrices.