Beispiel #1
0
 def multiplex( row, seqID, muts ):
     """Expand one design into every combination of the residue options given per position.

     ``keep_scores`` is not a parameter of this function; it is read from the
     enclosing scope, which is not visible in this block.

     :param row: Single design (series-like) holding the sequence and its identifier.
     :param str seqID: Identifier of the sequence of interest.
     :param muts: Iterable of ``(position, residues)`` pairs. ``position`` is resolved
         through ``get_selection``; ``residues`` is an iterable (e.g. a string) with
         the residue types to combine at that position.

     :return: Container created with ``row._constructor_expanddim``. Its first row
         holds the unmodified source sequence.
     """
     seqNM = _check_column(row, "sequence", seqID)
     idNM = "description"
     seq = list(row[seqNM])
     for p in muts:
         # -1 because we are going to access string positions.
         shift = get_selection(p[0], seqID, row.get_reference_shift(seqID))[0] - 1
         seq[shift] = p[1]
     # Each slot of ``seq`` is iterated by ``product``; untouched slots are single
     # characters, mutated slots contribute one variant per option.
     data = {seqNM: ["".join(x) for x in itertools.product(*seq)]}
     data[seqNM].insert(0, row[seqNM])
     name = row.get_id()
     # Raw string: "\d" inside a plain literal is an invalid escape sequence.
     if re.search(r"_v\d+$", name) is None:
         data[idNM] = [name + "_v{0:04d}".format(x) for x in range(len(data[seqNM]))]
         # The first entry is the source sequence, so it keeps the source name.
         data[idNM][0] = name
     else:
         data[idNM] = [name + "_v{0:04d}".format(x) for x in range(1, len(data[seqNM]) + 1)]
     if keep_scores:
         # Replicate every remaining column for each generated variant.
         for col in row.index:
             if col not in [seqNM, idNM]:
                 data[col] = [row[col]] * len(data[idNM])
     else:
         # Only carry over the other sequences; a dedicated loop name avoids
         # shadowing the residue list ``seq`` built above.
         for other in row.get_available_sequences():
             if other != seqID:
                 data[_check_column(row, "sequence", other)] = row.get_sequence(other)
     return row._constructor_expanddim(data)
Beispiel #2
0
def _get_key_reference(obj, ctype, seqID, key_residues):
    """Return the stored reference of type ``ctype``, optionally limited to ``key_residues``.

    :param obj: Container from which ``_get_reference`` retrieves the data.
    :param str ctype: Kind of reference requested from ``_get_reference``.
    :param str seqID: Identifier of the sequence of interest.
    :param key_residues: Selection handed to ``get_selection``; when :data:`None`
        the full reference is returned.

    :return: :class:`str` - full reference, or only the selected positions joined
        together.
    """
    from rstoolbox.components import get_selection

    reference = _get_reference(obj, ctype, seqID)
    shift = _get_reference(obj, "sft", seqID)
    if key_residues is not None:
        picked = get_selection(key_residues, seqID, shift, len(reference))
        letters = np.array(list(reference))
        # Selected positions are 1-based while array positions are 0-based.
        return "".join(letters[picked - 1])
    return reference
Beispiel #3
0
 def mutations( reference, row, seqID ):
     """List the positions at which the design sequence differs from ``reference``.

     The comparison is case-insensitive and position labels are obtained from
     ``get_selection`` using the reference shift of ``row``.

     :param str reference: Sequence to compare against.
     :param row: Design providing ``get_sequence`` and ``get_reference_shift``.
     :param str seqID: Identifier of the sequence of interest.

     :return: :class:`tuple` with the comma-joined mutation labels
         (``<reference><position><design>``), the comma-joined positions and the
         number of mutations.

     :raises:
         :ValueError: if both sequences have different lengths.
     """
     target = row.get_sequence(seqID)
     if len(reference) != len(target):
         raise ValueError("Sequence lengths do not match")

     labels = []
     positions = []
     for idx, (ref_res, new_res) in enumerate(zip(reference, target)):
         ref_up = ref_res.upper()
         new_up = new_res.upper()
         if ref_up == new_up:
             continue
         # idx is 0-based; the selection machinery works with 1-based counts.
         number = str(get_selection(idx + 1, seqID, row.get_reference_shift(seqID))[0])
         labels.append(ref_up + number + new_up)
         positions.append(number)
     return ",".join(labels), ",".join(positions), len(labels)
Beispiel #4
0
 def format_mutations( row, seqID, key_residues ):
     """Collect the mutations of ``row`` that fall inside ``key_residues``.

     :param row: Design providing ``get_reference_shift``, ``get_sequence`` and
         ``get_mutations``.
     :param str seqID: Identifier of the sequence of interest.
     :param key_residues: Selection handed to ``get_selection``.

     :return: :func:`list` of ``(position, residues)`` tuples, where ``residues`` is
         the first and last character of the mutation label joined together.
     """
     shift = row.get_reference_shift(seqID)
     seq = row.get_sequence(seqID)
     kr = get_selection(key_residues, seqID, shift, len(seq))
     mutations = row.get_mutations(seqID).split(",")
     muts = []
     # Splitting an empty string yields [''], which means "no mutations".
     if mutations != ['']:
         for m in mutations:
             m = m.strip()
             # Raw string: "\d" inside a plain literal is an invalid escape sequence.
             # NOTE(review): labels are presumably shaped like "A12G"; a label
             # without digits would make ``re.search`` return None here.
             pos = int(re.search(r"(\d+)", m).group(1))
             if pos in kr:
                 muts.append((pos, m[0] + m[-1]))
     return muts
Beispiel #5
0
def _get_key_sequence(obj, ctype, seqID, key_residues):
    """Extract the ``key_residues`` positions from the ``ctype`` column of ``obj``.

    :param obj: :class:`~pandas.Series` (single entry) or frame-like container
        (one entry per row).
    :param str ctype: Kind of sequential data; resolved to a column name by
        ``_check_column``.
    :param str seqID: Identifier of the sequence of interest.
    :param key_residues: Selection handed to ``get_selection``.

    :return: :class:`str` when ``obj`` is a :class:`~pandas.Series`; otherwise the
        result of applying the extraction to every entry of the column.
    """
    from rstoolbox.components import get_selection
    from .reference import get_reference_shift

    content = obj[_check_column(obj, ctype, seqID)]
    shift = get_reference_shift(obj, seqID)

    single = isinstance(obj, pd.Series)
    # For containers, the length is taken from the first entry.
    size = len(content) if single else len(content.iloc[0])
    kr = get_selection(key_residues, seqID, shift, size)

    def _pick(text):
        # -1 because we access string positions
        return "".join(np.array(list(text))[kr - 1])

    # NOTE(review): the ``> 1`` test means a selection of exactly one residue
    # also produces an empty result - confirm this is intended.
    has_selection = len(kr) > 1
    if single:
        return _pick(content) if has_selection else ""
    if has_selection:
        return content.apply(_pick)
    return content.apply(lambda _text: "")
Beispiel #6
0
def generate_mutants_from_matrix( self, seqID, matrix, count,
                                  key_residues=None, limit_refseq=False ):
    """From a provided positional frequency matrix, generates ``count`` random variants.

    It takes into account the individual frequency assigned to each residue type and
    position. It does **not** generate the highest possible scored sequence according to
    the matrix, but picks randomly at each position according to the frequencies for
    that position.

    For each :class:`.DesignSeries`, it will generate a :class:`.DesignFrame` in which the
    original sequence becomes the ``reference_sequence``, inheriting the ``reference_shift``.

    .. warning::
        This is a **computationally expensive** function. Take this in consideration when
        trying to run it.

    Each :class:`.DesignFrame` will have the following structure:

    ======================  ============================================
    Column                                                Data Content
    ======================  ============================================
    **description**         Identifier of the mutant
    **sequence_<seqID>**    Sequence content
    **pssm_score_<seqID>**  Score obtained by applying ``matrix``
    ======================  ============================================

    :param str seqID: |seqID_param|
    :param matrix: Positional frequency matrix. **column:** residue type; **index:**
        sequence position.
    :type matrix: :class:`~pandas.DataFrame`
    :param int count: Expected number of **unique** generated combinations. If the number is
        bigger than the possible options, it will default to the total amount of options.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param bool limit_refseq: When :data:`True`, pick only residue types with probabilities
        equal or higher to the source sequence.

    :return: :func:`list` of :class:`.DesignFrame` - New set of design sequences.

    :raises:
        :ValueError: if matrix rows do not match sequence length.

    .. seealso::
        :meth:`.DesignFrame.generate_mutant_variants`
        :meth:`.DesignFrame.score_by_pssm`
        :meth:`.DesignSeries.generate_mutant_variants`
        :meth:`.DesignSeries.score_by_pssm`

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.tests.helper import random_frequency_matrix
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {'scores': ['score', 'description'], 'sequence': 'B'})
           ...: df.add_reference_sequence('B', df.get_sequence('B').values[0])
           ...: matrix = random_frequency_matrix(len(df.get_reference_sequence('B')), 0)
           ...: key_res = [3,5,8,12,15,19,25,27]
           ...: mutants = df.iloc[1].generate_mutants_from_matrix('B', matrix, 5, key_res)
           ...: mutants[0].identify_mutants('B')

    """
    from rstoolbox.components import get_selection
    from rstoolbox.components import DesignSeries, DesignFrame

    def max_options( matrix, seq, key_residues, limit_refseq):
        # Upper bound on the number of sequences that can be generated.
        # Python integers are used on purpose: they have arbitrary precision,
        # whereas fixed-width numpy integers wrap around silently on overflow.
        if limit_refseq is False:
            return 20 ** len(key_residues)
        ori_index = matrix.index
        matrix = matrix.copy()
        # Positional index so that ``row.name`` addresses the sequence string.
        matrix.index = range(0, matrix.shape[0])
        # Per position: residue types at least as frequent as the source residue.
        options = (matrix.apply(lambda row: np.sum(row >= row[seq[row.name]]), axis=1))
        options.index = ori_index
        total = 1
        for available in options[key_residues]:
            total *= int(available)
        return total

    data = []
    if isinstance(self, pd.DataFrame):
        # Frames delegate to each of their rows and concatenate the results.
        for _, row in self.iterrows():
            data.extend(row.generate_mutants_from_matrix(seqID, matrix, count,
                                                         key_residues, limit_refseq))
        return data

    source_seq = self.get_sequence(seqID)
    if matrix.shape[0] != len(source_seq):
        raise ValueError("Matrix rows and sequence length should match.")
    # Make sure index and sequence shift match
    matrix = matrix.copy()
    shift = self.get_reference_shift(seqID)
    matrix.index = get_selection(None, seqID, shift, length=matrix.shape[0])

    if key_residues is not None:
        key_residues = get_selection(key_residues, seqID, shift, matrix.shape[0])
    else:
        key_residues = list(matrix.index.values)

    seqnm = "sequence_{}".format(seqID)
    data.append(DesignFrame([], columns=["description", seqnm]))
    name  = self.get_id()

    options = max_options(matrix, source_seq, key_residues, limit_refseq)
    # Defensive fallback for a degenerate (non-positive) option count.
    if options <= 0:
        options = count + 1
    target = min(count, options)

    while data[-1].shape[0] < target:
        seqaa = list(source_seq)
        thisname = name + "_v{0:04d}".format(data[-1].shape[0] + 1)
        for aap in key_residues:
            matI = matrix.loc[aap].copy()
            if limit_refseq:
                # Discard residue types less frequent than the source one and
                # renormalize so the remaining frequencies add up to 1.
                matI[matI < matI[seqaa[aap - 1]]] = 0
                matI = matI / matI.sum()
            seqaa[aap - 1] = np.random.choice(matI.index.values, 1, p=list(matI))[0]
        mutant = "".join(seqaa)
        # A variant identical to the source sequence is not a mutant; draw again.
        if mutant == source_seq:
            continue
        data[-1] = data[-1].append(DesignSeries([thisname, mutant],
                                                ["description", seqnm]),
                                   ignore_index=True)
        # NOTE(review): the result of ``drop_duplicates`` is discarded, so repeated
        # variants are currently kept. Left untouched on purpose: enforcing
        # uniqueness changes when this loop terminates and needs a design decision.
        data[-1].drop_duplicates([seqnm])
    data[-1].add_reference(seqID, source_seq, shift=shift)
    data[-1] = data[-1].score_by_pssm(seqID, matrix)
    return data
Beispiel #7
0
def logo_plot(df,
              seqID,
              refseq=True,
              key_residues=None,
              line_break=None,
              font_size=35,
              colors="WEBLOGO"):
    """Generates classic **LOGO** plots.

    :param df: Data container.
    :type df: Union[:class:`.DesignFrame`, :class:`.SequenceFrame`]
    :param str seqID: |seqID_param|.
    :param bool refseq: if :data:`True` (default), mark the original residues according to
        the reference sequence.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param int line_break: Force a line-change in the plot after n residues are plotted.
    :param float font_size: Expected size of the axis font.
    :param colors: Colors to assign; it can be the name of a available color set or
        a dictionary with a color for each type.
    :type colors: Union[:class:`str`, :class:`dict`]

    :return: :class:`~matplotlib.figure.Figure` and
        :func:`list` of :class:`~matplotlib.axes.Axes`

    :raises:
        :ValueError: if ``df`` is not a :class:`~pandas.DataFrame` or derived from it.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.plot import logo_plot
           ...: import matplotlib.pyplot as plt
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {"sequence": "B"})
           ...: df.add_reference_sequence("B", df.get_sequence("B")[0])
           ...: fig, axes = logo_plot(df, "B", refseq=True, line_break=50)
           ...: plt.tight_layout()

        @savefig sequence_logo_plot_docs.png width=5in
        In [2]: plt.show()
    """
    # Validate before doing any work, so that invalid input neither changes the
    # global rcParams nor triggers font loading.
    if not isinstance(df, pd.DataFrame):
        raise ValueError(
            "Input data must be in a DataFrame, DesignFrame or SequenceFrame")

    def _letterAt(letter,
                  x,
                  y,
                  yscale=1,
                  ax=None,
                  globscale=1.35,
                  LETTERS=None,
                  COLOR_SCHEME=None):
        # Draw one glyph scaled vertically by its score and placed at (x, y).
        text = LETTERS[letter]
        t = mpl.transforms.Affine2D().scale(1 * globscale, yscale * globscale) + \
            mpl.transforms.Affine2D().translate(x, y) + ax.transData
        p = PathPatch(text, lw=0, fc=COLOR_SCHEME[letter], transform=t)
        if ax is not None:
            ax.add_artist(p)
        return p

    def _dataframe2logo(data):
        # Per position: (residue, score) pairs with score > 0, sorted by ascending
        # score (ties by residue) so that the highest letter ends on top.
        aa = list(data)
        odata = []
        for _, pos in data.iterrows():
            pdata = []
            for k in aa:
                if pos[k] > 0.0000000:
                    pdata.append((k, float(pos[k])))
            odata.append(sorted(pdata, key=operator.itemgetter(1, 0)))
        return odata

    def _chunks(l, n):
        # Yield consecutive slices of ``l`` of at most ``n`` items.
        for i in range(0, len(l), n):
            yield l[i:i + n]

    order = [
        "A", "V", "I", "L", "M", "F", "Y", "W", "S", "T", "N", "Q", "R", "H",
        "K", "D", "E", "C", "G", "P"
    ]
    data = copy.deepcopy(df)

    mpl.rcParams['svg.fonttype'] = 'none'
    # Graphical Properties of resizable letters
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        '../components/square.ttf')
    fp = FontProperties(fname=path, weight="bold")
    globscale = 1.22
    letters_shift = -0.5
    # Resolve the color scheme once instead of once per plotted letter.
    palette = color_scheme(colors)
    LETTERS = {}
    for aa in palette:
        LETTERS[aa] = TextPath((letters_shift, 0), aa, size=1, prop=fp)

    # Data type management: a plain DataFrame with one column per residue type
    # is taken as frequencies; anything else as a set of designs.
    if not isinstance(data, (DesignFrame, SequenceFrame)):
        if len(set(data.columns.values).intersection(
                set(order))) == len(order):
            data = SequenceFrame(data)
        else:
            data = DesignFrame(data)
    if isinstance(data, DesignFrame):
        data = data.sequence_frequencies(seqID)

    # key_residues management.
    length = len(data.get_reference_sequence(seqID)) if refseq else None
    key_residues = get_selection(key_residues, seqID, list(data.index.values),
                                 length)

    # Plot
    if line_break is None:
        figsize = (len(data) * 2, 2.3 * 2)
        grid = (1, 1)
        fig = plt.figure(figsize=figsize)
        axs = [
            plt.subplot2grid(grid, (0, 0)),
        ]
        krs = [
            key_residues,
        ]
    else:
        # Rows must follow the selected residues (what is actually chunked below);
        # counting all positions creates more axes than chunks for a subset
        # selection and fails when indexing ``krs``.
        rows = int(math.ceil(float(len(key_residues)) / line_break))
        figsize = (float(len(data) * 2) / rows, 2.3 * 2 * rows)
        grid = (rows, 1)
        fig = plt.figure(figsize=figsize)
        axs = [plt.subplot2grid(grid, (row, 0)) for row in range(rows)]
        krs = list(_chunks(key_residues, line_break))

    font = FontProperties()
    font.set_size(font_size)
    font.set_weight('bold')

    for idx, ax in enumerate(axs):
        # Refseq and key_residues management.
        # ``ref_seq`` is "" (never None) when ``refseq`` is False, so the checks
        # below always pass and the secondary axis is always created.
        ref_seq = data.get_reference_sequence(seqID, krs[idx]) if refseq else ""
        # data and key_residues management.
        _data = data.get_key_residues(krs[idx])

        maxv = int(math.ceil(data.max_hight()))

        ticks = len(_data)
        # Keep the width of every row identical, including a shorter last row.
        if line_break is not None and len(_data) < line_break:
            ticks = line_break
        ax.set_xticks(np.arange(0.5, ticks + 1))
        ax.set_yticks(range(0, maxv + 1))
        ax.set_xticklabels(_data.index.values)
        ax.set_yticklabels(np.arange(0, maxv + 1, 1))
        if ref_seq is not None:
            ax2 = ax.twiny()
            ax2.set_xticks(ax.get_xticks())
            ax2.set_xticklabels(list(ref_seq))
        sns.despine(ax=ax, trim=True)
        ax.grid(False)
        if ref_seq is not None:
            sns.despine(ax=ax2, top=False, right=True, left=True, trim=True)
            ax2.grid(False)
        # NOTE(review): recent matplotlib releases expose ``Axes.lines`` as
        # read-only - confirm against the supported matplotlib version.
        ax.lines = []
        wdata = _dataframe2logo(_data)
        x = 0.5
        for scores in wdata:
            # Stack the letters of this position from the bottom up.
            y = 0
            for base, score in scores:
                _letterAt(base, x, y, score, ax, globscale, LETTERS, palette)
                y += score
            x += 1
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontproperties(font)
        if ref_seq is not None:
            for label in (ax2.get_xticklabels() + ax2.get_yticklabels()):
                label.set_fontproperties(font)

    return fig, axs
Beispiel #8
0
def per_residue_matrix_score_plot(df,
                                  seqID,
                                  ax,
                                  matrix="BLOSUM62",
                                  selections=None,
                                  **kwargs):
    """Draw, position by position, the substitution-matrix score of one decoy.

    The decoy is compared against its ``reference_sequence``. If the per-residue
    score column is not present in the decoy, it is computed on the fly through
    ``sequence_similarity``.

    Any extra keyword argument (``color``, ``linestyle``...) is forwarded to the
    call that plots the score line.

    :param df: |df_param|
    :type df: :class:`.DesignSeries`
    :param str seqID: |seqID_param|
    :param ax: matplotlib axis to which we will plot.
    :type ax: :py:class:`~matplotlib.axes.Axes`
    :param str matrix: |matrix_param|
    :param selections: List of regions to highlight; each position should be
        a selector and a color.
    :type selections: :func:`list` of :class:`tuple` with |keyres_types| and
        a color (:class:`str` or :class:`int`)

    :raises:
        :ValueError: If the data container is not :class:`.DesignSeries` or it
            does not have a ``reference_sequence``.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.plot import per_residue_matrix_score_plot
           ...: import matplotlib.pyplot as plt
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {"sequence": "B"})
           ...: df.add_reference_sequence('B', df.iloc[0]['sequence_B'])
           ...: df.add_reference_shift('B', 10)
           ...: seles = [('15-25', 'red'), ('45B-60B', 'green')]
           ...: fig = plt.figure(figsize=(25, 10))
           ...: ax0 = plt.subplot2grid((2, 1), (0, 0))
           ...: per_residue_matrix_score_plot(df.iloc[1], "B", ax0)
           ...: ax1 = plt.subplot2grid((2, 1), (1, 0))
           ...: per_residue_matrix_score_plot(df.iloc[1], "B", ax1, selections=seles)

        @savefig per_residue_matrix_score_plot_docs.png width=5in
        In [2]: plt.show()


    """
    usable = isinstance(df, DesignSeries) and df.has_reference_sequence(seqID)
    if not usable:
        raise ValueError(
            "Data must be a DesignSeries with reference for the requested seqID"
        )

    shift = df.get_reference_shift(seqID)
    reference = df.get_reference_sequence(seqID)
    n_res = len(reference)

    score_col = '{0}_{1}_per_res'.format(matrix.lower(), seqID)
    if score_col not in df.index:
        # Scores are missing: compute them on a one-row frame and take it back.
        df = sequence_similarity(df.to_frame().T, seqID, matrix=matrix).iloc[0]

    # Dashed zero line as visual baseline, then the actual per-residue score.
    ax.plot(range(0, n_res), [0] * n_res, color='grey', linestyle='dashed')
    ax.plot(range(0, n_res), df[score_col], **kwargs)

    ax.set_xlim(0, n_res - 1)
    ax.set_xticks(range(0, n_res, 5))
    if isinstance(shift, int):
        tick_labels = [tick + shift for tick in range(0, n_res, 5)]
    else:
        # A non-integer shift is sliced directly to obtain the labels.
        tick_labels = shift[0::5]
    ax.set_xticklabels(tick_labels)

    # Secondary axis showing the alignment string, one character per residue.
    alignment = df['{0}_{1}_ali'.format(matrix.lower(), seqID)]
    top_axis = ax.twiny()
    top_axis.set_xticks(range(0, n_res))
    top_axis.set_xticklabels(list(alignment.replace('.', ' ')))
    top_axis.tick_params('x', top=False, pad=0)

    y_low, y_high = ax.get_ylim()
    regions = [] if selections is None else selections
    for region in regions:
        selector, color = region[0], region[1]
        try:
            shifted = Selection(selector).is_shifted()
        except AttributeError:
            shifted = False
        offset = int(shifted)
        picked = get_selection(selector, seqID, shift, n_res)
        left = picked[0] - offset
        right = picked[-1] - offset

        # Rectangle taller than the visible range; limits are restored below.
        ax.fill([left, right, right, left],
                [y_low - 1, y_low - 1, y_high + 1, y_high + 1],
                color=color,
                alpha=0.2,
                zorder=-100)
    ax.set_ylim(y_low, y_high)

    ax.set_ylabel(matrix.upper())
Beispiel #9
0
def positional_structural_count(df, seqID=None, key_residues=None):
    """Fraction of each secondary structure type found at every sequence position.

    Only the reduced secondary structure alphabet is counted: ``H``, ``E`` and ``L``.

    :param df: |df_param|.
    :type df: Union[:py:class:`.DesignFrame`, :py:class:`.FragmentFrame`]
    :param str seqID: |seqID_param|. Required when input is :class:`.DesignFrame`.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|

    :return: :class:`~pandas.DataFrame` - where rows are sequence positions and
        columns are the secondary structure identifiers ``H``, ``E``, ``L``.

    :raises:
        :AttributeError: if the data passed is not in Union[:class:`.DesignFrame`,
            :class:`.FragmentFrame`]. It will *not* try to cast a provided
            :class:`~pandas.DataFrame`, as it would not be possible to know into which of
            the two possible inputs it needs to be casted.
        :AttributeError: if input is :class:`.DesignFrame` and ``seqID`` is not provided.
        :KeyError: |sseID_error| when input is :class:`.DesignFrame`.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.analysis import positional_structural_count
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_ssebig.minisilent.gz",
           ...:                         {'scores': ['score'], 'structure': 'C'})
           ...: df = positional_structural_count(df.iloc[1:], 'C')
           ...: df.head()
    """
    from rstoolbox.components import DesignFrame, FragmentFrame
    from rstoolbox.components import get_selection

    fractions = {"H": [], "E": [], "L": []}

    def _register(observed):
        # Append, for one position, the fraction of each structure type.
        counts = collections.Counter(observed)
        total = float(len(observed))
        for kind in ("H", "E", "L"):
            fractions[kind].append(float(counts[kind]) / total)

    is_design = isinstance(df, DesignFrame)
    if is_design:
        if seqID is None:
            raise AttributeError("seqID needs to be provided")
        if "structure_{}".format(seqID) not in df:
            raise KeyError("Structure {} not found in decoys.".format(seqID))
        # One column per sequence position, one row per decoy.
        per_position = df.get_sequential_data('structure', seqID)
        per_position = per_position.apply(lambda x: pd.Series(list(x)))
        for column in per_position.columns.values:
            _register("".join(per_position[column].tolist()))
    elif isinstance(df, FragmentFrame):
        for position in df["position"].drop_duplicates().values:
            at_position = df[df["position"] == position]["sse"].values
            _register("".join(at_position).upper())
    else:
        raise AttributeError(
            "Input data has to be a DesignFrame or a FragmentFrame.")

    result = pd.DataFrame(fractions)
    # Only a DesignFrame carries a reference shift; otherwise count from 1.
    shift = df.get_reference_shift(seqID) if is_design else 1
    # Re-index so that the index matches the PDB count.
    if isinstance(shift, int):
        result.index = result.index + shift
    else:
        result.index = shift
    wanted = get_selection(key_residues, seqID, list(result.index))
    return result.loc[list(wanted)]
Beispiel #10
0
def positional_structural_identity(df,
                                   seqID=None,
                                   ref_sse=None,
                                   key_residues=None):
    """Per position evaluation of how many times the provided data matches the expected
    ``reference_structure``.

    :param df: |df_param|.
    :type df: Union[:class:`.DesignFrame`, :class:`.FragmentFrame`]
    :param str seqID: |seqID_param|. Required when input is :class:`.DesignFrame`
    :param str ref_sse: Reference secondary structure. Required when input is
        :class:`.FragmentFrame`. Will overwrite the reference structure of
        :class:`.DesignFrame` if provided.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|

    :return: :class:`~pandas.DataFrame` - where rows are sequence positions and
        columns are ``sse`` (expected secondary structure),
        ``max_sse`` (most represented secondary structure) and
        ``identity_perc`` (percentage of matched secondary structure).

    :raises:
        :AttributeError: if the data passed is not in Union[:class:`.DesignFrame`,
            :class:`.FragmentFrame`]. It will *not* try to cast a provided
            :class:`~pandas.DataFrame`, as it would not be possible to know into which of
            the two possible inputs it needs to be casted.
        :AttributeError: if input is :class:`.DesignFrame` and ``seqID`` is not provided.
        :AttributeError: if input is :class:`.DesignFrame` without a reference structure
            for ``seqID``.
        :KeyError: |sseID_error| when input is :class:`.DesignFrame`.
        :AttributeError: if input is :class:`.FragmentFrame` and ``ref_sse`` is not provided.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.analysis import positional_structural_identity
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_ssebig.minisilent.gz",
           ...:                         {'scores': ['score'], 'structure': 'C'})
           ...: df.add_reference_structure('C', df.get_structure('C').values[0])
           ...: df = positional_structural_identity(df.iloc[1:], 'C')
           ...: df.head()
    """
    from rstoolbox.components import DesignFrame, FragmentFrame
    from rstoolbox.components import get_selection
    data = {"sse": [], "max_sse": [], "identity_perc": []}

    if isinstance(df, DesignFrame):
        if seqID is None:
            raise AttributeError("seqID needs to be provided")
        if not df.has_reference_structure(seqID):
            raise AttributeError(
                "There is no reference structure for seqID {}".format(seqID))
        if "structure_{}".format(seqID) not in df:
            raise KeyError("Structure {} not found in decoys.".format(seqID))
        ref_sse = ref_sse if ref_sse is not None else df.get_reference_structure(
            seqID)
        seqdata = df.get_structure(seqID)
        # One column per sequence position; the columns are numbered from 0.
        seqdata = seqdata.apply(lambda x: pd.Series(list(x)))
        for i in seqdata.columns.values:
            qseq = "".join(seqdata[i].tolist())
            sse = collections.Counter(qseq)
            # Columns are 0-based, so ``i`` indexes ``ref_sse`` directly. The same
            # expected type must be used for the label and for the identity count
            # (``i - 1`` would compare position 0 against the last character).
            expected = ref_sse[i]
            data["sse"].append(expected)
            data["max_sse"].append(sse.most_common(1)[0][0])
            data["identity_perc"].append(
                float(sse[expected]) / float(len(qseq)))

    elif isinstance(df, FragmentFrame):
        if ref_sse is None:
            raise AttributeError("ref_sse needs to be provided")

        for i in df["position"].drop_duplicates().values:
            qseq = "".join(df[df["position"] == i]["sse"].values).upper()
            sse = collections.Counter(qseq)
            # -1 because the position is used to access the reference string.
            expected = ref_sse[i - 1]
            data["sse"].append(expected)
            data["max_sse"].append(sse.most_common(1)[0][0])
            data["identity_perc"].append(
                float(sse[expected]) / float(len(qseq)))

    else:
        raise AttributeError(
            "Input data has to be a DesignFrame with a reference sequence "
            "or a FragmentFrame.")

    dfo = pd.DataFrame(data)
    # Get shift only from DesignFrame; FragmentFrame does not have one
    shft = df.get_reference_shift(seqID) if isinstance(df, DesignFrame) else 1
    # Shift the index so that index == PDB count
    if isinstance(shft, int):
        dfo.index = dfo.index + shft
    else:
        dfo.index = shft
    return dfo.loc[list(get_selection(key_residues, seqID, list(dfo.index)))]
def logo_plot(df,
              seqID,
              refseq=True,
              key_residues=None,
              line_break=None,
              hight_prop=4,
              font_size=35,
              refplot=False,
              colors="WEBLOGO"):
    """Generates full figure classic **LOGO** plots.

    :param df: Data container.
    :type df: Union[:class:`.DesignFrame`, :class:`.SequenceFrame`]
    :param str seqID: |seqID_param|.
    :param bool refseq: if :data:`True` (default), mark the original residues according to
        the reference sequence.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param int line_break: Force a line-change in the plot after n residues are plotted.
    :param int hight_prop: Hight proportion for each row of the plot.
    :param float font_size: Expected size of the axis font. NOTE(review): this value
        is currently not forwarded to :func:`.logo_plot_in_axis` by this function.
    :param bool refplot: When :data:`True`, it will reorder the residues in each position
        so that the reference residue will be on the bottom and setting a two-color scheme
        (provide a single color name in ``colors``) that allows to quickly identify the reference
        type in each position.
    :param colors: Colors to assign; it can be the name of a available color set or
        a dictionary with a color for each type. Available color schemes are: Weblogo
        (default), Hydrophobicity, Chemistry, and Charge.
    :type colors: Union[:class:`str`, :class:`dict`]

    :return: :class:`~matplotlib.figure.Figure` and
        :func:`list` of [:class:`~matplotlib.axes.Axes`, :class:`~matplotlib.axes.Axes`] -
        with primary and secondary axis of each subplot.

    :raises:
        :ValueError: if ``df`` is not a :class:`~pandas.DataFrame` or derived from it.
        :ValueError: if ``df`` is empty.

    .. seealso::
        :func:`.logo_plot_in_axis`

    .. rubric:: Example

    .. ipython::
        :okwarning:

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.plot import logo_plot
           ...: import matplotlib.pyplot as plt
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {"sequence": "B"})
           ...: df.add_reference_sequence("B", df.get_sequence("B")[0])
           ...: fig, axes = logo_plot(df, "B", refseq=True, line_break=50)
           ...: plt.tight_layout()

        @savefig sequence_logo_plot_docs.png width=5in
        In [2]: plt.show()

        In [3]: plt.close()
    """

    order = [
        "A", "V", "I", "L", "M", "F", "Y", "W", "S", "T", "N", "Q", "R", "H",
        "K", "D", "E", "C", "G", "P"
    ]

    # Type check goes first: ``.empty`` only exists on pandas containers, so
    # testing it beforehand turns the intended ValueError into an AttributeError.
    if not isinstance(df, pd.DataFrame):
        raise ValueError(
            "Input data must be in a DataFrame, DesignFrame or SequenceFrame")
    if df.empty:
        raise ValueError("Provided data container is empty. Nothing to plot.")
    data = copy.deepcopy(df)

    # Data type management: a plain DataFrame with one column per residue type
    # is taken as frequencies; anything else as a set of designs.
    if not isinstance(data, (DesignFrame, SequenceFrame)):
        if len(set(data.columns.values).intersection(
                set(order))) == len(order):
            data = SequenceFrame(data)
        else:
            data = DesignFrame(data)
    if isinstance(data, DesignFrame):
        data = data.sequence_frequencies(seqID)

    # key_residues management.
    length = len(data.get_reference_sequence(seqID)) if refseq else None
    key_residues = get_selection(key_residues, seqID, list(data.index.values),
                                 length)

    # Plot: each entry of ``axs`` is [primary axis, secondary axis]; the second
    # slot is filled by ``logo_plot_in_axis`` below.
    if line_break is None:
        figsize = (len(data) * 2, 2.3 * hight_prop)
        grid = (1, 1)
        fig = plt.figure(figsize=figsize)
        axs = [
            [plt.subplot2grid(grid, (0, 0)), None],
        ]
        krs = [
            key_residues,
        ]
    else:
        rows = int(math.ceil(float(len(key_residues)) / line_break))
        figsize = (float(len(data) * 2) / rows, 2.3 * hight_prop * rows)
        grid = (rows, 1)
        fig = plt.figure(figsize=figsize)
        axs = [[plt.subplot2grid(grid, (row, 0)), None] for row in range(rows)]
        krs = list(_chunks(key_residues, line_break))

    for idx, pair in enumerate(axs):
        pair[1] = logo_plot_in_axis(data,
                                    seqID,
                                    pair[0],
                                    refseq=refseq,
                                    key_residues=krs[idx],
                                    refplot=refplot,
                                    colors=colors,
                                    line_break=line_break)
    return fig, axs
def positional_sequence_similarity(df,
                                   seqID=None,
                                   ref_seq=None,
                                   key_residues=None,
                                   matrix="BLOSUM62"):
    """Score identity and similarity to a ``reference_sequence`` position by position.

    Given a data container holding a set of sequences, every sequence position is
    compared against the residue found at that position in the ``reference_sequence``.
    The evaluation is carried out per position across the whole set, rather than per
    individual sequence.

    In a way, the outcome is an extreme simplification of a :class:`.SequenceFrame`.

    :param df: |df_param|.
    :type df: Union[:class:`.DesignFrame`, :class:`.FragmentFrame`]
    :param str seqID: |seqID_param|. Required when input is :class:`.DesignFrame`.
    :param str ref_seq: Reference sequence. Required when input is :class:`.FragmentFrame`.
        When provided together with a :class:`.DesignFrame`, it takes precedence over the
        reference sequence stored in the container.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param str matrix: |matrix_param|. Default is ``BLOSUM62``.


    :return: :class:`~pandas.DataFrame` - one row per selected sequence position, with
        columns ``identity_perc`` and ``positive_perc``. Each value is the corresponding
        count divided by the number of residues evaluated at that position.

    :raises:
        :AttributeError: if the data passed is not in Union[:class:`.DesignFrame`,
            :class:`.FragmentFrame`]. A plain :class:`~pandas.DataFrame` is *not* cast,
            as there is no way to know which of the two accepted inputs it should become.
        :AttributeError: if input is :class:`.DesignFrame` and ``seqID`` is not provided.
        :AttributeError: |reference_error| when input is :class:`.DesignFrame`.
        :KeyError: |seqID_error| when input is :class:`.DesignFrame`.
        :AttributeError:  if input is :class:`.FragmentFrame` and ``ref_seq`` is not provided.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.analysis import positional_sequence_similarity
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: pd.set_option('display.max_columns', 500)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {'scores': ['score'], 'sequence': 'B'})
           ...: df.add_reference_sequence('B', df.get_sequence('B').values[0])
           ...: df = positional_sequence_similarity(df.iloc[1:], 'B')
           ...: df.head()
    """
    from rstoolbox.components import DesignFrame, FragmentFrame
    from rstoolbox.components import get_selection

    scores = {"identity_perc": [], "positive_perc": []}
    # The substitution matrix is resolved before the input container is inspected.
    substitution = SM.get_matrix(matrix)

    def _score_position(residues, ref_aa):
        # Compare all residues gathered for one position against the reference
        # residue and store both ratios.
        _, identical, positive, _ = _positional_similarity(
            residues, ref_aa, substitution)
        scores["identity_perc"].append(float(identical) / float(len(residues)))
        scores["positive_perc"].append(float(positive) / float(len(residues)))

    is_design = isinstance(df, DesignFrame)
    if not is_design and not isinstance(df, FragmentFrame):
        raise AttributeError("Input data has to be a DesignFrame with a "
                             "reference sequence or a FragmentFrame.")

    if is_design:
        if seqID is None:
            raise AttributeError("seqID needs to be provided")
        if not df.has_reference_sequence(seqID):
            raise AttributeError(
                "There is no reference sequence for seqID {}".format(seqID))
        if "sequence_{}".format(seqID) not in df:
            raise KeyError("Sequence {} not found in decoys.".format(seqID))

        # An explicitly provided reference wins over the one stored in the container.
        if ref_seq is None:
            ref_seq = df.get_reference_sequence(seqID)

        # Split every sequence into single residues: one column per position.
        per_position = df.get_sequence(seqID).apply(
            lambda sequence: pd.Series(list(sequence)))
        for idx, column in enumerate(per_position.columns.values):
            residues = "".join(per_position[column].tolist())
            _score_position(residues, ref_seq[idx])
    else:
        if ref_seq is None:
            raise AttributeError("ref_seq needs to be provided")

        for position in df["position"].drop_duplicates().values:
            residues = "".join(df[df["position"] == position]["aa"].values)
            # -1 as the position is used to access the reference string.
            _score_position(residues, ref_seq[position - 1])

    dfo = pd.DataFrame(scores)

    # Only a DesignFrame carries a reference shift; a FragmentFrame has none.
    if is_design:
        shift = df.get_reference_shift(seqID)
    else:
        shift = 1

    # Re-label the index so that index == PDB count: an integer shift offsets
    # the default index, anything else is assigned as the index itself.
    dfo.index = (dfo.index + shift) if isinstance(shift, int) else shift

    picked = get_selection(key_residues, seqID, list(dfo.index))
    # -1 for array like count
    return dfo.iloc[[pos - 1 for pos in picked]]