Ejemplo n.º 1
0
def cool2pixels(cool: cooler.api.Cooler):
    """Return the contact matrix in "pixels" format

    Args:
        cool: Input cooler object

    Returns:
        cooler.core.RangeSelector2D object
    """
    return cool.matrix(as_pixels=True, balance=False, sparse=True)
Ejemplo n.º 2
0
def get_frag(c: cooler.api.Cooler,
             resolution: int,
             offsets: pd.core.series.Series,
             chrom1: str,
             start1: int,
             end1: int,
             chrom2: str,
             start2: int,
             end2: int,
             width: int = 22,
             height: int = -1,
             padding: int = 10,
             normalize: bool = True,
             balanced: bool = True,
             percentile: float = 100.0,
             ignore_diags: int = 0,
             no_normalize: bool = False) -> np.ndarray:
    """
    Retrieves a matrix fragment.

    Args:
        c:
            Cooler object.
        chrom1:
            Chromosome 1. E.g.: `1` or `chr1`.
        start1:
            First start position in base pairs relative to `chrom1`.
        end1:
            First end position in base pairs relative to `chrom1`.
        chrom2:
            Chromosome 2. E.g.: `1` or `chr1`.
        start2:
            Second start position in base pairs relative to `chrom2`.
        end2:
            Second end position in base pairs relative to `chrom2`.
        offsets:
            Pandas Series of chromosome offsets in bins.
        width:
            Width of the fragment in pixels.
        height:
            Height of the fragments in pixels. If `-1` `height` will equal
            `width`. Defaults to `-1`.
        padding: Percental padding related to the dimension of the fragment.
            E.g., 10 = 10% padding (5% per side). Defaults to `10`.
        normalize:
            If `True` the fragment will be normalized to [0, 1].
            Defaults to `True`.
        balanced:
            If `True` the fragment will be balanced using Cooler.
            Defaults to `True`.
        percentile:
            Percentile clip. E.g., For 99 the maximum will be
            capped at the 99-percentile. Defaults to `100.0`.
        ignore_diags:
            Number of diagonals to be ignored, i.e., set to 0.
            Defaults to `0`.
        no_normalize:
            If `true` the returned matrix is not normalized.
            Defaults to `False`.

    Returns:

    """

    if height is -1:
        height = width

    # Restrict padding to be [0, 100]%
    padding = min(100, max(0, padding)) / 100

    try:
        offset1 = offsets[chrom1]
        offset2 = offsets[chrom2]
    except KeyError:
        # One more try before we will fail miserably
        offset1 = offsets['chr{}'.format(chrom1)]
        offset2 = offsets['chr{}'.format(chrom2)]

    start_bin1 = offset1 + int(round(float(start1) / resolution))
    end_bin1 = offset1 + int(round(float(end1) / resolution)) + 1

    start_bin2 = offset2 + int(round(float(start2) / resolution))
    end_bin2 = offset2 + int(round(float(end2) / resolution)) + 1

    # Apply percentile padding
    padding1 = int(round(((end_bin1 - start_bin1) / 2) * padding))
    padding2 = int(round(((end_bin2 - start_bin2) / 2) * padding))
    start_bin1 -= padding1
    start_bin2 -= padding2
    end_bin1 += padding1
    end_bin2 += padding2

    # Get the size of the region
    dim1 = end_bin1 - start_bin1
    dim2 = end_bin2 - start_bin2

    # Get additional absolute padding if needed
    padding1 = 0
    if dim1 < width:
        padding1 = int((width - dim1) / 2)
        start_bin1 -= padding1
        end_bin1 += padding1

    padding2 = 0
    if dim2 < height:
        padding2 = int((height - dim2) / 2)
        start_bin2 -= padding2
        end_bin2 += padding2

    # In case the final dimension does not math the desired dimension we
    # increase the end bin. This can be caused when the padding is not
    # divisible by 2, since the padding is rounded to the nearest integer.
    abs_dim1 = abs(start_bin1 - end_bin1)
    if abs_dim1 < width:
        end_bin1 += width - abs_dim1
        abs_dim1 = width

    abs_dim2 = abs(start_bin2 - end_bin2)
    if abs_dim2 < height:
        end_bin2 += height - abs_dim2
        abs_dim2 = height

    # Maximum width / height is 512
    if abs_dim1 > hss.SNIPPET_MAT_MAX_DATA_DIM: raise SnippetTooLarge()
    if abs_dim2 > hss.SNIPPET_MAT_MAX_DATA_DIM: raise SnippetTooLarge()

    # Finally, adjust to negative values.
    # Since relative bin IDs are adjusted by the start this will lead to a
    # white offset.
    real_start_bin1 = start_bin1 if start_bin1 >= 0 else 0
    real_start_bin2 = start_bin2 if start_bin2 >= 0 else 0

    # Get the data
    data = c.matrix(as_pixels=True, balance=False,
                    max_chunk=np.inf)[real_start_bin1:end_bin1,
                                      real_start_bin2:end_bin2]

    # Annotate pixels for balancing
    bins = c.bins(convert_enum=False)[['weight']]
    data = cooler.annotate(data, bins, replace=False)

    # Calculate relative bin IDs
    rel_bin1 = np.add(data['bin1_id'].values, -start_bin1)
    rel_bin2 = np.add(data['bin2_id'].values, -start_bin2)

    # Balance counts
    if balanced:
        values = data['count'].values.astype(np.float32)
        values *= data['weight1'].values * data['weight2'].values
    else:
        values = data['count'].values

    # Get pixel IDs for the upper triangle
    idx1 = np.add(np.multiply(rel_bin1, abs_dim1), rel_bin2)

    # Mirror matrix
    idx2_1 = np.add(data['bin2_id'].values, -start_bin1)
    idx2_2 = np.add(data['bin1_id'].values, -start_bin2)
    idx2 = np.add(np.multiply(idx2_1, abs_dim1), idx2_2)
    validBins = np.where((idx2_1 < abs_dim1) & (idx2_2 >= 0))

    # Ignore diagonals
    diags_start_row = None
    if ignore_diags > 0:
        try:
            diags_start_idx = np.min(
                np.where(data['bin1_id'].values == data['bin2_id'].values))
            diags_start_row = (rel_bin1[diags_start_idx] -
                               rel_bin2[diags_start_idx])
        except ValueError:
            pass

    # Copy pixel values onto the final array
    frag_len = abs_dim1 * abs_dim2
    frag = np.zeros(frag_len, dtype=np.float32)
    # Make sure we're within the bounds
    idx1_f = np.where(idx1 < frag_len)
    frag[idx1[idx1_f]] = values[idx1_f]
    frag[idx2[validBins]] = values[validBins]
    frag = frag.reshape((abs_dim1, abs_dim2))

    # Store low quality bins
    low_quality_bins = np.where(np.isnan(frag))

    # Assign 0 for now to avoid influencing the max values
    frag[low_quality_bins] = 0

    # Scale fragment down if needed
    scaled = False
    scale_x = width / frag.shape[0]
    if frag.shape[0] > width or frag.shape[1] > height:
        scaledFrag = np.zeros((width, height), float)
        frag = scaledFrag + zoomArray(frag, scaledFrag.shape, order=1)
        scaled = True

    # Normalize by minimum
    if not no_normalize:
        min_val = np.min(frag)
        frag -= min_val

    ignored_idx = None

    # Remove diagonals
    if ignore_diags > 0 and diags_start_row is not None:
        if width == height:
            scaled_row = int(np.rint(diags_start_row / scale_x))

            idx = np.diag_indices(width)
            scaled_idx = (idx if scaled_row == 0 else
                          [idx[0][scaled_row:], idx[0][:-scaled_row]])

            for i in range(ignore_diags):

                # First set all cells to be ignored to `-1` so that we can
                # easily query for them later.
                if i == 0:
                    frag[scaled_idx] = -1
                else:
                    dist_to_diag = scaled_row - i
                    dist_neg = min(0, dist_to_diag)
                    off = 0 if dist_to_diag >= 0 else i - scaled_row

                    # Above diagonal
                    frag[((scaled_idx[0] - i)[off:],
                          (scaled_idx[1])[off:])] = -1

                    # Extra cutoff at the bottom right
                    frag[(range(
                        scaled_idx[0][-1] - i,
                        scaled_idx[0][-1] + 1 + dist_neg,
                    ),
                          range(scaled_idx[1][-1],
                                scaled_idx[1][-1] + i + 1 + dist_neg))] = -1

                    # Below diagonal
                    frag[((scaled_idx[0] + i)[:-i], (scaled_idx[1])[:-i])] = -1

            # Save the final selection of ignored cells for fast access
            # later and set those values to `0` now.
            ignored_idx = np.where(frag == -1)
            frag[ignored_idx] = 0

        else:
            logger.warn(
                'Ignoring the diagonal only supported for squared features')

    # Capp by percentile
    max_val = np.percentile(frag, percentile)
    frag = np.clip(frag, 0, max_val)

    # Normalize by maximum
    if not no_normalize and max_val > 0:
        frag /= max_val

    # Set the ignored diagonal to the maximum
    if ignored_idx:
        frag[ignored_idx] = 1.0

    if not scaled:
        # Recover low quality bins
        frag[low_quality_bins] = -1

    return frag