Example #1
0
def sequence_logo(score_matrix,
                  order="value",
                  width=1.0,
                  ax=None,
                  sequence_type=Genome,
                  font_properties=None,
                  color_scheme=None,
                  **kwargs):
    """Plots a sequence logo for visualizing motifs.

    Parameters
    ----------
    score_matrix : np.ndarray
        An :math:`L \\times N` array (where :math:`L` is the length of
        the sequence, and :math:`N` is the size of the alphabet)
        containing the scores for each base occuring at each position.
    order : {'alpha', 'value'}
        The manner by which to sort the bases stacked at each position
        in the sequence logo plot.

            * 'alpha' - Bases go in the order they are found in the\
                      sequence alphabet.
            * 'value' - Bases go in the order of their value, with the\
                        largest at the bottom.
    width : float, optional
        Default is 1. The width of each character in the plotted logo.
        A value of 1 will mean that there is no gap between each the
        characters at each position. A value of 0 will not draw any
        characters.
    ax : matplotlib.pyplot.Axes or None, optional
        Default is `None`. The axes to plot on. If left as `None`, a new
        axis will be created.
    sequence_type : class, optional
        Default is `selene_sdk.sequences.Genome`. The type of sequence that
        the *in silico* mutagenesis results are associated with. This
        should generally be a subclass of `selene_sdk.sequences.Sequence`.
    font_properties : matplotlib.font_manager.FontProperties or None, optional
        Default is `None`. A `matplotlib.font_manager.FontProperties`
        object that specifies the properties of the font to use for
        plotting the motif. If `None`, no font will be used, and the
        text will be rendered by a path. This method of rendering paths
        is  preferred, as it ensures all character heights correspond to
        the actual values, and that there are no extra gaps between
        the tops and bottoms of characters at each position in the
        sequence logo. If the user opts to use a value other
        than `None`, then no such guarantee can be made.
    color_scheme : list(str) or None, optional
        Default is `None`. A list containing the hex codes or names of
        colors to use, appearing in the order of the bases of the
        sequence type. If left as `None`, a default palette will be made
        with `seaborn.color_palette`, and will have as many
        colors as there are characters in the input sequence alphabet.

    Returns
    -------
    matplotlib.pyplot.Axes
        The axes containing the sequence logo plot.

    Raises
    ------
    ValueError
        If the number of columns in `score_matrix` does not match the
        number of characters in the alphabet of `sequence_type`.

    ValueError
        If the number of colors in `color_palette` does not match the
        number of characters in the alphabet of `sequence_type`.

    Examples
    --------
    We have included an example of the output from a`sequence_logo`
    plot below:

    .. image:: ../../docs/source/_static/img/sequence_logo_example.png

    """
    # Note that everything will break if we do not deepcopy.
    score_matrix = deepcopy(score_matrix)

    score_matrix = score_matrix.transpose()
    if font_properties is not None:
        warnings.warn(
            "Specifying a value for `font_properties` (other than `None`) "
            "will use the `matplotlib`-based character paths, and causes "
            "distortions in the plotted motif. We recommend leaving "
            "`font_properties=None`. See the documentation for details.",
            UserWarning)

    if color_scheme is None:
        color_scheme = sns.color_palette("Set1",
                                         n_colors=len(sequence_type.BASES_ARR))
        color_scheme = color_scheme.as_hex()
    if len(color_scheme) < len(sequence_type.BASES_ARR):
        raise ValueError(
            "Color scheme is shorter than number of bases in sequence.")

    if score_matrix.shape[0] != len(sequence_type.BASES_ARR):
        raise ValueError(
            "Got score with {0} bases for sequence with {1} bases.".format(
                score_matrix.shape[0], len(sequence_type.BASES_ARR)))
    if ax is None:
        _, ax = plt.subplots(figsize=score_matrix.shape)

    # Determine offsets depending on sort order.
    positive_offsets = np.zeros_like(score_matrix)
    negative_offsets = np.zeros_like(score_matrix)
    bases = np.empty(score_matrix.shape, dtype=object)
    bases[:, :] = "?"  # This ensures blanks are visually obvious.

    # Change ordering of things based on input arguments.
    if order == "alpha":
        for i in range(score_matrix.shape[0]):
            bases[i, :] = sequence_type.BASES_ARR[i]

    elif order == "value":
        if np.sum(score_matrix < 0) != 0:
            sorted_scores = np.zeros_like(score_matrix)
            for j in range(score_matrix.shape[1]):
                # Sort the negative values and put them at bottom.
                div = np.sum(score_matrix[:, j] < 0.)
                negative_idx = np.argwhere(score_matrix[:, j] < 0.).flatten()
                negative_sort_idx = np.argsort(score_matrix[negative_idx, j],
                                               axis=None)
                sorted_scores[:div, j] = score_matrix[
                    negative_idx[negative_sort_idx], j]
                bases[:div, j] = sequence_type.BASES_ARR[
                    negative_idx[negative_sort_idx]].flatten()

                # Sort the positive values and stack atop the negatives.
                positive_idx = np.argwhere(score_matrix[:, j] >= 0.).flatten()
                positive_sort_idx = np.argsort(score_matrix[positive_idx, j],
                                               axis=None)
                sorted_scores[div:, j] = score_matrix[
                    positive_idx[positive_sort_idx], j]
                bases[div:, j] = sequence_type.BASES_ARR[
                    positive_idx[positive_sort_idx]].flatten()
            score_matrix = sorted_scores
        else:
            for j in range(score_matrix.shape[1]):
                sort_idx = np.argsort(score_matrix[:, j], axis=None)[::-1]
                bases[:, j] = sequence_type.BASES_ARR[sort_idx]
                score_matrix[:, j] = score_matrix[sort_idx, j]

    # Create offsets for each bar.
    for i in range(score_matrix.shape[0] - 1):
        y_coords = score_matrix[i, :]
        if i > 0:
            negative_offsets[i + 1, :] = negative_offsets[i, :]
            positive_offsets[i + 1, :] = positive_offsets[i, :]
        neg_idx = np.argwhere(y_coords < 0.)
        pos_idx = np.argwhere(y_coords >= 0.)
        negative_offsets[i + 1, neg_idx] += y_coords[neg_idx]
        positive_offsets[i + 1, pos_idx] += y_coords[pos_idx]

    for i in range(score_matrix.shape[0]):
        x_coords = np.arange(score_matrix.shape[1]) + 0.5
        y_coords = score_matrix[i, :]

        # Manage negatives and positives separately.
        offsets = np.zeros(score_matrix.shape[1])
        negative_idx = np.argwhere(y_coords < 0.)
        positive_idx = np.argwhere(y_coords >= 0.)
        offsets[negative_idx] = negative_offsets[i, negative_idx]
        offsets[positive_idx] = positive_offsets[i, positive_idx]
        bars = ax.bar(x_coords,
                      y_coords,
                      color="black",
                      width=width,
                      bottom=offsets)
        for j, bar in enumerate(bars):
            base = bases[i, j]
            bar.set_color(color_scheme[sequence_type.BASE_TO_INDEX[base]])
            bar.set_edgecolor(None)

    # Iterate over the barplot's bars and turn them into letters.
    new_patches = []
    for i, bar in enumerate(ax.patches):
        base_idx = i // score_matrix.shape[1]
        seq_idx = i % score_matrix.shape[1]
        base = bases[base_idx, seq_idx]
        # We construct a text path that tracks the bars in the barplot.
        # Thus, the barplot takes care of scaling and translation,
        #  and we just copy it.
        if font_properties is None:
            text = Path(_SVG_PATHS[base][0], _SVG_PATHS[base][1])
        else:
            text = TextPath((0., 0.), base, fontproperties=font_properties)
        b_x, b_y, b_w, b_h = bar.get_extents().bounds
        t_x, t_y, t_w, t_h = text.get_extents().bounds
        scale = (b_w / t_w, b_h / t_h)
        translation = (b_x - t_x, b_y - t_y)
        text = PathPatch(text, facecolor=bar.get_facecolor(), lw=0.)
        bar.set_facecolor("none")
        text.set_path_effects([_TextPathRenderingEffect(bar)])
        transform = transforms.Affine2D().translate(*translation).scale(*scale)
        text.set_transform(transform)
        new_patches.append(text)

    for patch in new_patches:
        ax.add_patch(patch)
    ax.set_xlim(0, score_matrix.shape[1])
    ax.set_xticks(np.arange(score_matrix.shape[1]) + 0.5)
    ax.set_xticklabels(np.arange(score_matrix.shape[1]))
    return ax