Example #1
0
def random_proteins(size, count):
    """Build a container filled with random protein sequences.

    Each decoy is named ``decoy_NNNN`` (1-based, zero padded to four digits) and gets a
    ``sequence_A`` column whose letters are drawn from the alphabet ``ARNDCQEGHILKMFPSTWYV``.

    :param int size: Length of the sequences.
    :param int count: Number of sequences.

    :return: :class:`.DesignFrame`
    """
    from rstoolbox.components import DesignFrame

    def _random_chain(_row):
        # The row content is irrelevant; ``apply`` is only used to get one draw per decoy.
        residues = list("ARNDCQEGHILKMFPSTWYV")
        letters = [choice(residues) for _ in range(size)]
        return ''.join(letters)

    names = []
    for number in range(1, count + 1):
        names.append('decoy_{:04d}'.format(number))

    frame = DesignFrame({'description': names})
    frame['sequence_A'] = frame.apply(_random_chain, axis=1)
    return frame
def positional_enrichment(df, other, seqID):
    """Compute the per-residue enrichment of the sequences in ``df`` relative to ``other``.

    The sequence frequencies of ``df`` are divided, position by position and residue type
    by residue type, by those of ``other``.

    .. note::
        Position / AA type pairs present in ``df`` but not ``other`` will have a value of
        :data:`~np.inf`. Undefined ratios (``NaN``) are reported as ``0``.

    :param df: |df_param|.
    :type df: Union[:class:`.DesignFrame`, :class:`~pandas.DataFrame`]
    :param other: |df_param|.
    :type other: Union[:class:`.DesignFrame`, :class:`~pandas.DataFrame`]
    :param str seqID: |seqID_param|.

    :return: :class:`.FragmentFrame` - with enrichment percentages.

    :raises:
        :NotImplementedError: if the data passed is not in Union[:class:`.DesignFrame`,
            :class:`~pandas.DataFrame`].
        :KeyError: |seqID_error|.
    """
    from rstoolbox.components import DesignFrame

    def _as_design_frame(container):
        # Accept DesignFrames untouched, upgrade plain DataFrames, refuse anything else.
        if isinstance(container, DesignFrame):
            return container
        if isinstance(container, pd.DataFrame):
            return DesignFrame(container)
        raise NotImplementedError('Unknow input format')

    df = _as_design_frame(df)
    other = _as_design_frame(other)

    enrichment = df.sequence_frequencies(seqID) / other.sequence_frequencies(seqID)
    # The reference is only carried over when both containers agree on it.
    if df._reference == other._reference:
        enrichment.transfer_reference(df)
    return enrichment.replace(np.nan, 0)
Example #3
0
def generate_mutants_from_matrix( self, seqID, matrix, count,
                                  key_residues=None, limit_refseq=False ):
    """From a provided positional frequency matrix, generates ``count`` random variants.

    It takes into account the individual frequency assigned to each residue type and
    position. It does **not** generate the highest possible scored sequence according to
    the matrix, but picks randomly at each position according to the frequencies for
    that position.

    For each :class:`.DesignSeries`, it will generate a :class:`.DesignFrame` in which the
    original sequence becomes the ``reference_sequence``, inheriting the ``reference_shift``.

    .. warning::
        This is a **computationally expensive** function. Take this in consideration when
        trying to run it.

    .. note::
        Variants are generated by rejection sampling: draws equal to the source sequence or
        to an already generated variant are discarded. If 1000 consecutive draws produce
        nothing new, generation stops and fewer than ``count`` variants are returned.

    Each :class:`.DesignFrame` will have the following structure:

    ======================  ============================================
    Column                                                Data Content
    ======================  ============================================
    **description**         Identifier of the mutant
    **sequence_<seqID>**    Sequence content
    **pssm_score_<seqID>**  Score obtained by applying ``matrix``
    ======================  ============================================

    :param str seqID: |seqID_param|
    :param matrix: Positional frequency matrix. **column:** residue type; **index:**
        sequence position.
    :type matrix: :class:`~pandas.DataFrame`
    :param int count: Expected number of **unique** generated combinations. If the number is
        bigger than the possible options, it will default to the total amount of options.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param bool limit_refseq: When :data:`True`, pick only residue types with probabilities
        equal or higher to the source sequence.

    :return: :func:`list` of :class:`.DesignFrame` - New set of design sequences.

    :raises:
        :ValueError: if matrix rows do not match sequence length.

    .. seealso::
        :meth:`.DesignFrame.generate_mutant_variants`
        :meth:`.DesignFrame.score_by_pssm`
        :meth:`.DesignSeries.generate_mutant_variants`
        :meth:`.DesignSeries.score_by_pssm`

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.tests.helper import random_frequency_matrix
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {'scores': ['score', 'description'], 'sequence': 'B'})
           ...: df.add_reference_sequence('B', df.get_sequence('B').values[0])
           ...: matrix = random_frequency_matrix(len(df.get_reference_sequence('B')), 0)
           ...: key_res = [3,5,8,12,15,19,25,27]
           ...: mutants = df.iloc[1].generate_mutants_from_matrix('B', matrix, 5, key_res)
           ...: mutants[0].identify_mutants('B')

    """
    from rstoolbox.components import get_selection
    from rstoolbox.components import DesignFrame

    def max_options( matrix, seq, key_residues, limit_refseq):
        # Upper bound of the number of different sequences that can be drawn.
        if limit_refseq is False:
            # Python integers have arbitrary precision, so this cannot overflow the way
            # a fixed-width numpy power does for long selections.
            return 20 ** len(key_residues)
        ori_index = matrix.index
        matrix = matrix.copy()
        matrix.index = range(0, matrix.shape[0])
        # Per position: how many residue types are at least as frequent as the source one.
        options = (matrix.apply(lambda row: np.sum(row >= row[seq[row.name]]), axis=1))
        options.index = ori_index
        return np.prod(options[key_residues])

    # A frame is processed decoy by decoy; each row contributes its own DesignFrame.
    if isinstance(self, pd.DataFrame):
        data = []
        for _, row in self.iterrows():
            data.extend(row.generate_mutants_from_matrix(seqID, matrix, count,
                                                         key_residues, limit_refseq))
        return data

    source = self.get_sequence(seqID)
    if matrix.shape[0] != len(source):
        raise ValueError("Matrix rows and sequence length should match.")
    # Make sure index and sequence shift match
    matrix = matrix.copy()
    shift = self.get_reference_shift(seqID)
    matrix.index = get_selection(None, seqID, shift, length=matrix.shape[0])

    if key_residues is not None:
        key_residues = get_selection(key_residues, seqID, shift, matrix.shape[0])
    else:
        key_residues = list(matrix.index.values)

    seqnm = "sequence_{}".format(seqID)
    name  = self.get_id()

    options = max_options(matrix, source, key_residues, limit_refseq)
    # The numpy product can still wrap around for very large numbers; a non-positive
    # value means "more options than we can count", so ``count`` becomes the only limit.
    if options <= 0:
        options = count + 1
    target = min(count, options)

    # Consecutive draws allowed to yield nothing new (source sequence or repeated variant)
    # before giving up. ``options`` also counts the source sequence and zero-probability
    # residue types, so ``target`` may not be reachable; this keeps the loop finite.
    max_misses = 1000
    misses = 0

    names = []
    sequences = []
    seen = set()
    while len(sequences) < target and misses < max_misses:
        seqaa = list(source)
        for aap in key_residues:
            matI = matrix.loc[aap].copy()
            if limit_refseq:
                # Discard residue types less frequent than the source one and renormalize.
                # NOTE(review): ``aap - 1`` assumes positions count from 1 — confirm for
                # sequences with a different reference shift.
                matI[matI < matI[seqaa[aap - 1]]] = 0
                matI = matI / matI.sum()
            seqaa[aap - 1] = np.random.choice(matI.index.values, 1, p=list(matI))[0]
        variant = "".join(seqaa)
        if variant == source or variant in seen:
            misses += 1
            continue
        misses = 0
        seen.add(variant)
        names.append(name + "_v{0:04d}".format(len(sequences) + 1))
        sequences.append(variant)

    # Build the container once instead of appending row by row.
    mutants = DesignFrame({"description": names, seqnm: sequences},
                          columns=["description", seqnm])
    mutants.add_reference(seqID, source, shift=shift)
    return [mutants.score_by_pssm(seqID, matrix)]
Example #4
0
def apply_resfile( self, seqID, filename, rscript=None, keep_input_scores=False ):  # pragma: no cover
    """Apply a generated Rosetta `resfile
    <https://www.rosettacommons.org/docs/latest/rosetta_basics/file_types/resfiles>`_
    to the decoy.

    This function needs to be called after the appropriate mutant variants have been created
    and their corresponding **resfiles** have been written.

    .. note::
        Depends on :ref:`rosetta.path <options>` and :ref:`rosetta.compilation <options>`,
        if the ``filename`` does not exist.

    .. attention::
        This function **REQUIRES** a local installation of **Rosetta**.

    To execute this function it is important that the ``source_file`` assigned to the
    :class:`.DesignFrame` is an original silent file and **not a minisilent**, as the
    original structure of the decoy needs to be used in order to generate the variants.
    If that is not the case, use :class:`.DesignFrame.replace_source_files`.

    When ``filename`` does not exist, one Rosetta run is executed per variant inside a
    temporary directory and the silent files of the runs that succeeded are merged into
    ``filename``. Every executed command is echoed to standard output.

    :param str seqID: |seqID_param|
    :param str filename: Name of the final silent file that will contain all the variant's data.
        If the file exists, it is assumed that the data was already created and data will be
        directly loaded from that file.
    :param str rscript: By default, the script executed will be the one generated by
        :func:`.mutations`. One can provide its own script (either as the file name of the
        script or as a string of the content itself) **as long as it fulfills two conditions**:
        (1) It must contain the **AddJobPairData Mover** and (2) it should accept the script
        variable ``resfile``. An example on how to use these two conditions can be extrapolated
        from :func:`.mutations`.
    :param bool keep_input_scores: When :data:`True` (default :data:`False`), it will keep the
        score terms present in the source decoy (as they appear in the original silent file)
        for the variants.

    :return: :class:`.DesignFrame` with the scores for the mutants.

    :raise:
        :SystemError: If all variants fail to be generated or if they cannot be merged.
        :IOError: If Rosetta path cannot be found.
        :AttributeError: If the resfiles for the variants were not previously created.

    .. seealso::
        :meth:`.DesignFrame.generate_mutant_variants`
        :meth:`.DesignFrame.generate_mutants_from_matrix`
        :meth:`.DesignFrame.generate_wt_reversions`
        :meth:`.DesignFrame.make_resfile`
        :meth:`.DesignSeries.generate_mutant_variants`
        :meth:`.DesignSeries.generate_mutants_from_matrix`
        :meth:`.DesignSeries.generate_wt_reversions`
        :meth:`.DesignSeries.make_resfile`

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {'scores': ['score', 'description'], 'sequence': 'B'})
           ...: df.add_reference_sequence('B', df.get_sequence('B').values[0])
           ...: dfwt = df.iloc[0].generate_mutant_variants('B', [(1, "TGP"), (6, "ERG"),
           ...:                                                  (14, "MAT")])
           ...: # Call in test-mode
           ...: dfwt = dfwt.make_resfile("B", "NATAA", "mutants.resfile", write=False )
           ...: dfwt2 = dfwt.iloc[:3].apply_resfile("B",
           ...:                                     "../rstoolbox/tests/data/variants.silent.gz")
           ...: dfwt2
    """
    from rstoolbox.components import DesignSeries, DesignFrame
    from rstoolbox.io import parse_rosetta_file
    from rstoolbox.utils import mutations

    # A single decoy is handled as a one-row frame.
    if isinstance(self, DesignSeries):
        self = DesignFrame(self).T

    resfile = 'resfile_{}'.format(seqID)
    if not os.path.isfile(filename):
        wdir = tempfile.mkdtemp()
        exe = make_rosetta_app_path('rosetta_scripts')
        if resfile not in self.columns:
            raise AttributeError("Resfiles are needed to execute this function.")
        if rscript is None:
            rscript = mutations(seqID)
        # Anything that is not an existing file is taken as the script content itself.
        if not os.path.isfile(rscript):
            script_file = os.path.join(wdir, 'script.xml')
            with open(script_file, 'w') as fd:
                fd.write(rscript)
            rscript = script_file

        command = ['{0}', '-parser:protocol {1}', '-in:file:silent {2}', '-in:file:tags {3}',
                   '-out:file:silent {4}', '-parser:script_vars resfile={5}']
        if not keep_input_scores:
            command.append('-keep_input_scores false')
        command = ' '.join(command)
        outfiles = []
        errors = 0
        sys.stdout.write("Running Rosetta\n")
        for _, row in self.iterrows():
            # Variants are named <origin>_vNNNN; the input tag is the decoy they derive from.
            if re.search(r'_v\d{4}$', row['description']):
                origin = "_".join(row['description'].split('_')[:-1])
            else:
                origin = row['description']
            outfile = os.path.join(wdir, row['description'] + '.silent')
            cmd = command.format(exe, rscript, " ".join(self.get_source_files()),
                                 origin, outfile, row[resfile])
            sys.stdout.write(cmd + "\n")
            error = execute_process( cmd )
            if bool(error):
                errors += 1
                sys.stdout.write("Execution for variant {} has failed\n".format(row['description']))
            else:
                # Only outputs of successful runs are handed to the merge step.
                outfiles.append(outfile)

        if errors < self.shape[0]:
            exe = make_rosetta_app_path('combine_silent')
            command = ['{0}', '-in:file:silent {1}', '-out:file:silent {2}']
            command = ' '.join(command)
            cmd = command.format(exe, " ".join(outfiles), filename)
            sys.stdout.write("Merging all silent files\n")
            sys.stdout.write(cmd + "\n")
            error = execute_process( cmd )
            if bool(error):
                raise SystemError("A file with the new variants could not be created.")
        else:
            raise SystemError("All variants failed to be generated.")

    df = parse_rosetta_file(filename)
    # The resfile column is the join key; descriptions from the silent file are dropped.
    df = df.drop(columns=['description'])
    return self.merge(df, on=resfile, how='left')
Example #5
0
def sequence_frequency_plot(df,
                            seqID,
                            ax,
                            aminosY=True,
                            clean_unused=-1,
                            refseq=True,
                            key_residues=None,
                            border_color="green",
                            border_width=2,
                            labelsize=None,
                            xrotation=0,
                            yrotation=0,
                            **kwargs):
    """Makes a heatmap subplot into the provided axis showing the sequence distribution
    of each residue type for each position.

    Apart from the function arguments, any argument that can be provided to the
    :func:`seaborn.heatmap` function can also be provided here.

    By default, the heatmap generated will have the residue types as y-axis and the
    sequence positions as x-axis.

    Some tips:

    #. **Do you want to set the orientation of the color bar vertical?** \
        Add the parameter: ``cbar_kws={"orientation": "vertical"}``
    #. **Do you want to put the color bar in a different axis?** \
        This is quite recommendable, as the color bar in the same axis does not \
        tend to look that good. Add the parameter: ``cbar_ax=[second_axis]``
    #. **You don't want a color bar?** \
        Add the parameter: ``cbar=False``

    :param df: Data container.
    :type df: Union[:class:`.DesignFrame`, :class:`.SequenceFrame`]
    :param str seqID: |seqID_param|.
    :param ax: Where to plot the heatmap.
    :type ax: :class:`~matplotlib.axes.Axes`
    :param bool aminosY: Set to :data:`False` to invert the orientation of the heatmap.
    :param float clean_unused: Remove amino acids from the plot when they never get represented
        over the given frequency. Residues present in the reference sequence are not taken
        into account. Negative values (default) disable the cleaning.
    :param bool refseq: if :data:`True` (default), mark the original residues according to
        the reference sequence.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param border_color: Color to use to mark the original residue types. An :class:`int`
        is taken as an index into the current :func:`seaborn.color_palette`.
    :type border_color: Union[:class:`int`, :class:`str`]
    :param int border_width: Line width used to mark the original residue types.
    :param int labelsize: Change the size of the text in the axis.
    :param float xrotation: Rotation to apply in the x-axis text (degrees).
    :param float yrotation: Rotation to apply in the y-axis text (degrees).

    :raises:
        :ValueError: if input is not a :class:`~pandas.DataFrame` derived object.
        :KeyError: |reference_error|.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.plot import sequence_frequency_plot
           ...: import matplotlib.pyplot as plt
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {"sequence": "B"})
           ...: fig = plt.figure(figsize=(25, 10))
           ...: ax = plt.subplot2grid((1, 1), (0, 0))
           ...: sequence_frequency_plot(df, "B", ax, refseq=False, cbar=False, xrotation=90)

        @savefig sequence_frequency_plot_docs.png width=5in
        In [2]: plt.show()
    """

    order = [
        "A", "V", "I", "L", "M", "F", "Y", "W", "S", "T", "N", "Q", "R", "H",
        "K", "D", "E", "C", "G", "P"
    ]
    # Work on a copy so that the caller's container is never modified.
    data = copy.deepcopy(df)

    fp = FontProperties()
    fp.set_family("monospace")

    # Data type management.
    if not isinstance(data, pd.DataFrame):
        raise ValueError(
            "Input data must be in a DataFrame, DesignFrame or SequenceFrame")
    else:
        if not isinstance(data, (DesignFrame, SequenceFrame)):
            # A plain DataFrame holding a column for every residue type is taken as a
            # frequency table; anything else is taken as a set of designs.
            if len(set(data.columns.values).intersection(
                    set(order))) == len(order):
                data = SequenceFrame(data)
            else:
                data = DesignFrame(data)
    if isinstance(data, DesignFrame):
        data = data.sequence_frequencies(seqID)
    if isinstance(data, SequenceFrame):
        order = sorted(data.columns.values.tolist(),
                       key=lambda x: order.index(x))
        if not data.is_transposed():
            data = data.transpose().reindex(order)
        else:
            data = data.reindex(order)

    # Refseq and key_residues management.
    ref_seq = data.get_reference_sequence(seqID,
                                          key_residues) if refseq else ""

    # data and key_residues management.
    data = data.get_key_residues(key_residues)

    if clean_unused >= 0:
        data.delete_empty(clean_unused)
        data = data.clean()
        order = sorted(data.index.values.tolist(),
                       key=lambda x: order.index(x))
        data = data.reindex(order)

    # heatmap parameters and others
    kwargs.setdefault("cmap", "Blues")  # define the color-range of the plot
    kwargs.setdefault("linewidths", 1)  # linewidths are fixed to 1
    kwargs.setdefault("square",
                      True)  # square is True if user don't say otherwise
    # by default the color bar is horizontal
    kwargs.setdefault("cbar_kws", {"orientation": "horizontal"})

    # plot
    if not aminosY:
        data = data.transpose()
    sns.heatmap(data, ax=ax, **kwargs)

    # styling plot
    # seaborn made a change in the ticks from 0.7 to 0.8,
    # this should take care that both versions work ok.
    if LooseVersion(sns.__version__) < LooseVersion("0.8"):
        order.reverse()
    if aminosY:
        ax.yaxis.set_ticks(np.arange(0.5, len(order) + 0.5))
        ax.yaxis.set_ticklabels(order, rotation=yrotation)
        for label in ax.get_yticklabels():
            label.set_fontproperties(fp)
        ax.xaxis.set_ticks(
            np.arange(0.5,
                      len(data.columns.values.tolist()) + 0.5))
        ax.xaxis.set_ticklabels(data.columns.values.tolist(),
                                rotation=xrotation)
        ax.set_ylabel("residue type")
        if labelsize is not None:
            ax.tick_params(labelsize=labelsize)
    else:
        ax.xaxis.set_ticks(np.arange(0.5, len(order) + 0.5))
        ax.xaxis.set_ticklabels(order, rotation=xrotation)
        for label in ax.get_xticklabels():
            label.set_fontproperties(fp)
        ax.yaxis.set_ticks(
            np.arange(0.5,
                      len(data.index.values.tolist()) + 0.5))
        ax.yaxis.set_ticklabels(data.index.values.tolist(), rotation=yrotation)
        ax.set_xlabel("residue type")
        if labelsize is not None:
            ax.tick_params(labelsize=labelsize)

    # marking reference sequence
    # (truthiness instead of an identity test against a string literal)
    if refseq and ref_seq:
        if isinstance(border_color, int):
            border_color = sns.color_palette()[border_color]
        for i, aa in enumerate(ref_seq):
            # One unit square per position, placed on the cell of the reference residue.
            if aminosY:
                aa_position = (i, order.index(aa))
            else:
                aa_position = (order.index(aa), i)
            ax.add_patch(
                Rectangle(aa_position,
                          1,
                          1,
                          fill=False,
                          clip_on=False,
                          edgecolor=border_color,
                          lw=border_width,
                          zorder=100))
Example #6
0
def logo_plot(df,
              seqID,
              refseq=True,
              key_residues=None,
              line_break=None,
              font_size=35,
              colors="WEBLOGO"):
    """Generates classic **LOGO** plots.

    A new figure is created with one axis per line of the logo; when ``line_break`` is
    given, the positions are split in consecutive groups of that size, one per axis.

    :param df: Data container.
    :type df: Union[:class:`.DesignFrame`, :class:`.SequenceFrame`]
    :param str seqID: |seqID_param|.
    :param bool refseq: if :data:`True` (default), mark the original residues according to
        the reference sequence.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_types|
    :param int line_break: Force a line-change in the plot after n residues are plotted.
    :param float font_size: Expected size of the axis font.
    :param colors: Colors to assign; it can be the name of a available color set or
        a dictionary with a color for each type.
    :type colors: Union[:class:`str`, :class:`dict`]

    :return: :class:`~matplotlib.figure.Figure` and
        :func:`list` of :class:`~matplotlib.axes.Axes`

    :raises:
        :ValueError: if input is not a :class:`~pandas.DataFrame` derived object.

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.plot import logo_plot
           ...: import matplotlib.pyplot as plt
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {"sequence": "B"})
           ...: df.add_reference_sequence("B", df.get_sequence("B")[0])
           ...: fig, axes = logo_plot(df, "B", refseq=True, line_break=50)
           ...: plt.tight_layout()

        @savefig sequence_logo_plot_docs.png width=5in
        In [2]: plt.show()
    """
    def _letterAt(letter, x, y, yscale, ax, globscale, letters, scheme):
        # Draw one letter at (x, y) of ``ax``, stretched vertically by ``yscale``.
        text = letters[letter]
        t = mpl.transforms.Affine2D().scale(1 * globscale, yscale * globscale) + \
            mpl.transforms.Affine2D().translate(x, y) + ax.transData
        p = PathPatch(text, lw=0, fc=scheme[letter], transform=t)
        ax.add_artist(p)
        return p

    def _dataframe2logo(data):
        # Per position: (residue type, value) pairs with a positive value, sorted by
        # increasing value so that the tallest letter ends up on top of the stack.
        aa = list(data)
        odata = []
        for _, pos in data.iterrows():
            pdata = []
            for k in aa:
                if pos[k] > 0.0000000:
                    pdata.append((k, float(pos[k])))
            odata.append(sorted(pdata, key=operator.itemgetter(1, 0)))
        return odata

    def _chunks(l, n):
        # Consecutive slices of ``l`` of, at most, ``n`` elements.
        for i in range(0, len(l), n):
            yield l[i:i + n]

    order = [
        "A", "V", "I", "L", "M", "F", "Y", "W", "S", "T", "N", "Q", "R", "H",
        "K", "D", "E", "C", "G", "P"
    ]
    # Work on a copy so that the caller's container is never modified.
    data = copy.deepcopy(df)

    mpl.rcParams['svg.fonttype'] = 'none'
    # Graphical Properties of resizable letters
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        '../components/square.ttf')
    fp = FontProperties(fname=path, weight="bold")
    globscale = 1.22
    letters_shift = -0.5
    # The color scheme is resolved once and shared by every letter drawn.
    scheme = color_scheme(colors)
    LETTERS = {}
    for aa in scheme:
        LETTERS[aa] = TextPath((letters_shift, 0), aa, size=1, prop=fp)

    # Data type management.
    if not isinstance(data, pd.DataFrame):
        raise ValueError(
            "Input data must be in a DataFrame, DesignFrame or SequenceFrame")
    else:
        if not isinstance(data, (DesignFrame, SequenceFrame)):
            # A plain DataFrame holding a column for every residue type is taken as a
            # frequency table; anything else is taken as a set of designs.
            if len(set(data.columns.values).intersection(
                    set(order))) == len(order):
                data = SequenceFrame(data)
            else:
                data = DesignFrame(data)
    if isinstance(data, DesignFrame):
        data = data.sequence_frequencies(seqID)

    # key_residues management.
    length = len(data.get_reference_sequence(seqID)) if refseq else None
    key_residues = get_selection(key_residues, seqID, list(data.index.values),
                                 length)

    # Plot
    if line_break is None:
        figsize = (len(data) * 2, 2.3 * 2)
        grid = (1, 1)
        fig = plt.figure(figsize=figsize)
        axs = [
            plt.subplot2grid(grid, (0, 0)),
        ]
        krs = [
            key_residues,
        ]
    else:
        rows = int(math.ceil(float(len(data)) / line_break))
        figsize = (float(len(data) * 2) / rows, 2.3 * 2 * rows)
        grid = (rows, 1)
        fig = plt.figure(figsize=figsize)
        axs = [plt.subplot2grid(grid, (row, 0)) for row in range(rows)]
        krs = list(_chunks(key_residues, line_break))

    font = FontProperties()
    font.set_size(font_size)
    font.set_weight('bold')

    # Every axis shares the same vertical range, taken from the full data set.
    maxv = int(math.ceil(data.max_hight()))

    for line, ax in enumerate(axs):
        # Refseq and key_residues management.
        ref_seq = data.get_reference_sequence(seqID, krs[line]) if refseq else ""
        # data and key_residues management.
        _data = data.get_key_residues(krs[line])

        ticks = len(_data)
        # A shorter last line keeps the tick spacing of the full lines.
        if line_break is not None and len(_data) < line_break:
            ticks = line_break
        ax.set_xticks(np.arange(0.5, ticks + 1))
        ax.set_yticks(range(0, maxv + 1))
        ax.set_xticklabels(_data.index.values)
        ax.set_yticklabels(np.arange(0, maxv + 1, 1))
        if ref_seq is not None:
            # Twin axis on top showing the reference residue of each position.
            ax2 = ax.twiny()
            ax2.set_xticks(ax.get_xticks())
            ax2.set_xticklabels(list(ref_seq))
        sns.despine(ax=ax, trim=True)
        ax.grid(False)
        if ref_seq is not None:
            sns.despine(ax=ax2, top=False, right=True, left=True, trim=True)
            ax2.grid(False)
        # Drop any line present in the axis; ``ax.lines`` cannot be reassigned in
        # recent matplotlib versions, so each artist is removed explicitly.
        for artist in list(ax.lines):
            artist.remove()
        wdata = _dataframe2logo(_data)
        x = 0.5
        for scores in wdata:
            # Letters of one position are stacked bottom-up.
            y = 0
            for base, score in scores:
                _letterAt(base, x, y, score, ax, globscale, LETTERS, scheme)
                y += score
            x += 1
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontproperties(font)
        if ref_seq is not None:
            for label in (ax2.get_xticklabels() + ax2.get_yticklabels()):
                label.set_fontproperties(font)

    return fig, axs
Example #7
0
def random_fastq(sequence, description, selection, btags, num_seqs, min_repeat,
                 max_repeat, num_files, prefix):
    """Generate a requested number of fastq files.

    Random protein variants of ``sequence`` are reverse-translated to DNA with randomly
    picked codons; each file contains every variant repeated a random number of times,
    in random order. Border tags are added only to the reads placed in even rows.

    :param str sequence: Starting protein sequence.
    :param str description: Name of the sequence.
    :param str selection: Region of the sequence to keep untouched.
    :param btags: Protein sequence border tags (start tag, stop tag).
    :type btags: :func:`list` of :class:`str`
    :param int num_seqs: Number of individual sequences to generate.
    :param int min_repeat: Minimum number of repetitions per sequence.
    :param int max_repeat: Maximum number of repetitions per sequence (inclusive).
    :param int num_files: Number of files to generate.
    :param str prefix: Prefixes for the files.

    :return: :func:`list` of :class:`str` - filenames generated
    """
    codontable = {
        'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
        'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
        'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
        'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
        'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
        'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
        'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
        'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
        'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W'}
    # Residue type -> list of codons that encode it.
    revtable = {}
    for codon, residue in codontable.items():
        revtable.setdefault(residue, []).append(codon)

    def rtranslate(seq, tab):
        # Reverse-translate a protein sequence picking a random codon per residue.
        return "".join([tab[aa][randint(0, len(tab[aa]) - 1)] for aa in seq])

    def add_tag(row, start, stop):
        # Only even rows get the border tags.
        return start + row['dna_A'] + stop if row.name % 2 == 0 else row['dna_A']

    # Create DesignFrame
    df = DesignFrame({'description': [description], 'sequence_A': sequence})
    df.add_reference_sequence('A', df.get_sequence('A').values[0])

    # Create Random Sequences; keep a segment as MOTIF
    mtx = random_frequency_matrix(len(df.get_reference_sequence('A')), 0)
    sel = Selection(selection)
    mutants = df.iloc[0].generate_mutants_from_matrix('A', mtx, num_seqs, ~sel)[0]

    # Generate DNA
    mutants['dna_A'] = mutants.apply(lambda row: rtranslate(row['sequence_A'], revtable), axis=1)

    # Start and stop tags are translated once and shared by all the files.
    start = rtranslate(btags[0], revtable)
    stop  = rtranslate(btags[1], revtable)

    filename = prefix + '_{:03d}.fasq'
    all_files = []
    for i in range(num_files):
        fname = filename.format(i + 1)
        all_files.append(fname)
        # Add weights and repeat: each sequence appears between min_repeat and
        # max_repeat times, then the rows are shuffled.
        mut = mutants.copy()
        mut['weight'] = [randint(min_repeat, max_repeat) for _ in range(mut.shape[0])]
        mut = mut.loc[mut.index.repeat(mut.weight)].sample(frac=1).reset_index(drop=True)
        # Add border tags
        mut['dna_A'] = mut.apply(lambda row: add_tag(row, start, stop), axis=1)
        # Write file: four lines per read (header, bases, separator, fixed quality string).
        with open(fname, 'w') as fd:
            for _, row in mut.iterrows():
                fd.write('@{};TEST_MAKEUP\n'.format(row['description']))
                fd.write(row['dna_A'] + '\n')
                fd.write('+\n')
                fd.write('?A@EC?C@AC=B>A@??DEC?EEC@C@DDD:\n')
    return all_files
def logo_plot_in_axis(df,
                      seqID,
                      ax,
                      refseq=True,
                      key_residues=None,
                      refplot=False,
                      colors="WEBLOGO",
                      **kwargs):
    """Generates classic **LOGO** plot in a given axis.

    :param df: Data container.
    :type df: Union[:class:`.DesignFrame`, :class:`.SequenceFrame`]
    :param str seqID: |seqID_param|.
    :param ax: Axis in which the logo is drawn.
    :type ax: :class:`~matplotlib.axes.Axes`
    :param bool refseq: if :data:`True` (default), mark the original residues according to
        the reference sequence.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_param|
    :param bool refplot: Accepted for interface compatibility with :func:`.logo_plot`;
        this function does not currently read it.
    :param colors: Colors to assign; it can be the name of a available color set or
        a dictionary with a color for each type. Available color schemes are: Weblogo
        (default), Hydrophobicity, Chemistry, and Charge.
    :type colors: Union[:class:`str`, :class:`dict`]
    :param kwargs: Only ``line_break`` is read. When it is provided (as done by
        :func:`.logo_plot`) and is bigger than the number of plotted positions, the x axis
        is extended to ``line_break`` positions.

    :return: :class:`~matplotlib.axes.Axes` - secondary axis

    :raises:
        :ValueError: if ``df`` is not a :class:`~pandas.DataFrame` (or derived) container.
        :ValueError: if the data container is empty.
        :ValueError: if ``colors`` is neither a :class:`str` nor a :class:`dict`.

    .. seealso::
        :func:`.logo_plot`
    """

    order = [
        "A", "V", "I", "L", "M", "F", "Y", "W", "S", "T", "N", "Q", "R", "H",
        "K", "D", "E", "C", "G", "P"
    ]

    # Validate the container type first; otherwise ``.empty`` below fails with an
    # unrelated AttributeError for inputs that are not DataFrames.
    if not isinstance(df, pd.DataFrame):
        raise ValueError(
            "Input data must be in a DataFrame, DesignFrame or SequenceFrame")
    data = copy.deepcopy(df)
    if data.empty:
        raise ValueError("Provided data container is empty. Nothing to plot.")

    mpl.rcParams['svg.fonttype'] = 'none'
    # Graphical Properties of resizable letters
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        '../components/square.ttf')
    fp = FontProperties(fname=path, weight="bold")
    globscale = 1.22
    letters_shift = -0.5

    # Resolve the palette once; it is reused for every letter drawn below.
    if isinstance(colors, dict):
        palette = colors
    elif isinstance(colors, str):
        palette = color_scheme(colors)
    else:
        raise ValueError(
            "Colors need to either be a string representing the name of a available "
            "color set or a dictionary with a color for each type.")
    LETTERS = {aa: TextPath((letters_shift, 0), aa, size=1, prop=fp) for aa in palette}

    # Data type management: plain DataFrames holding every amino acid as a column are
    # treated as frequency tables, anything else as a set of designs.
    if not isinstance(data, (DesignFrame, SequenceFrame)):
        if set(order).issubset(data.columns.values):
            data = SequenceFrame(data)
        else:
            data = DesignFrame(data)
    if isinstance(data, DesignFrame):
        data = data.sequence_frequencies(seqID)

    # Refseq and key_residues management.
    ref_seq = data.get_reference_sequence(seqID,
                                          key_residues) if refseq else ""
    # data and key_residues management.
    _data = data.get_key_residues(key_residues)

    maxv = int(math.ceil(data.max_hight()))

    ticks = len(_data)
    # This is applied if it comes from the logo_plot function
    line_break = kwargs.get('line_break')
    if line_break is not None and ticks < line_break:
        ticks = line_break
    ax.set_xticks(np.arange(0.5, ticks + 1))
    ax.set_yticks(range(0, maxv + 1))
    ax.set_xticklabels(_data.index.values)
    ax.set_yticklabels(np.arange(0, maxv + 1, 1))
    ax.set_xlim(-0.1, ticks + 0.1)
    ax2 = None
    # NOTE(review): with refseq=False ``ref_seq`` is "" (not None), so the secondary
    # axis is still created, just without labels. Kept as is to preserve the output.
    if ref_seq is not None:
        ax2 = ax.twiny()
        ax2.set_xticks(ax.get_xticks())
        ax2.set_xticklabels(list(ref_seq))
        ax2.set_xlim(-0.1, ticks + 0.1)
    sns.despine(ax=ax, trim=True)
    ax.grid(False)
    if ref_seq is not None:
        sns.despine(ax=ax2, top=False, right=True, left=True, trim=True)
        ax2.grid(False)

    # ``Axes.lines`` cannot be reassigned in recent matplotlib releases; removing each
    # artist empties the axis on both old and new versions.
    for line in list(ax.lines):
        line.remove()

    # Stack the letters of each position from the bottom up; positions are centered
    # on 0.5, 1.5, ... to match the x ticks set above.
    for position, scores in enumerate(_dataframe2logo(_data)):
        x = 0.5 + position
        y = 0
        for base, score in scores:
            _letterAt(base, x, y, score, ax, globscale, LETTERS, palette)
            y += score

    return ax2
def logo_plot(df,
              seqID,
              refseq=True,
              key_residues=None,
              line_break=None,
              hight_prop=4,
              font_size=35,
              refplot=False,
              colors="WEBLOGO"):
    """Generates full figure classic **LOGO** plots.

    :param df: Data container.
    :type df: Union[:class:`.DesignFrame`, :class:`.SequenceFrame`]
    :param str seqID: |seqID_param|.
    :param bool refseq: if :data:`True` (default), mark the original residues according to
        the reference sequence.
    :param key_residues: |keyres_param|.
    :type key_residues: |keyres_param|
    :param int line_break: Force a line-change in the plot after n residues are plotted.
    :param int hight_prop: Height proportion for each row of the plot.
    :param float font_size: Expected size of the axis font. Accepted for backwards
        compatibility; it is not currently applied to the figure.
    :param bool refplot: When :data:`True`, it will reorder the residues in each position
        so that the reference residue will be on the bottom and setting a two-color scheme
        (provide a single color name in ``colors``) that allows to quickly identify the reference
        type in each position. Forwarded as is to :func:`.logo_plot_in_axis`.
    :param colors: Colors to assign; it can be the name of a available color set or
        a dictionary with a color for each type. Available color schemes are: Weblogo
        (default), Hydrophobicity, Chemistry, and Charge.
    :type colors: Union[:class:`str`, :class:`dict`]

    :return: :class:`~matplotlib.figure.Figure` and
        :func:`list` of [:class:`~matplotlib.axes.Axes`, :class:`~matplotlib.axes.Axes`] -
        with primary and secondary axis of each subplot.

    :raises:
        :ValueError: if ``df`` is not a :class:`~pandas.DataFrame` (or derived) container.
        :ValueError: if the data container is empty.

    .. seealso::
        :func:`.logo_plot_in_axis`

    .. rubric:: Example

    .. ipython::
        :okwarning:

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.plot import logo_plot
           ...: import matplotlib.pyplot as plt
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {"sequence": "B"})
           ...: df.add_reference_sequence("B", df.get_sequence("B")[0])
           ...: fig, axes = logo_plot(df, "B", refseq=True, line_break=50)
           ...: plt.tight_layout()

        @savefig sequence_logo_plot_docs.png width=5in
        In [2]: plt.show()

        In [3]: plt.close()
    """

    order = [
        "A", "V", "I", "L", "M", "F", "Y", "W", "S", "T", "N", "Q", "R", "H",
        "K", "D", "E", "C", "G", "P"
    ]

    # Validate the container type first; otherwise ``.empty`` below fails with an
    # unrelated AttributeError for inputs that are not DataFrames.
    if not isinstance(df, pd.DataFrame):
        raise ValueError(
            "Input data must be in a DataFrame, DesignFrame or SequenceFrame")
    data = copy.deepcopy(df)
    if data.empty:
        raise ValueError("Provided data container is empty. Nothing to plot.")

    # Data type management: plain DataFrames holding every amino acid as a column are
    # treated as frequency tables, anything else as a set of designs.
    if not isinstance(data, (DesignFrame, SequenceFrame)):
        if set(order).issubset(data.columns.values):
            data = SequenceFrame(data)
        else:
            data = DesignFrame(data)
    if isinstance(data, DesignFrame):
        data = data.sequence_frequencies(seqID)

    # key_residues management.
    length = len(data.get_reference_sequence(seqID)) if refseq else None
    key_residues = get_selection(key_residues, seqID, list(data.index.values),
                                 length)

    # Plot layout: a single row unless a line break is requested, in which case the
    # selected residues are split into chunks of ``line_break`` positions.
    if line_break is None:
        rows = 1
        figsize = (len(data) * 2, 2.3 * hight_prop)
        krs = [key_residues]
    else:
        rows = int(math.ceil(float(len(key_residues)) / line_break))
        figsize = (float(len(data) * 2) / rows, 2.3 * hight_prop * rows)
        krs = list(_chunks(key_residues, line_break))

    fig = plt.figure(figsize=figsize)
    # Each entry holds [primary axis, secondary axis]; the secondary one is filled below.
    axs = [[plt.subplot2grid((rows, 1), (row, 0)), None] for row in range(rows)]

    for row, axes_pair in enumerate(axs):
        axes_pair[1] = logo_plot_in_axis(data,
                                         seqID,
                                         axes_pair[0],
                                         refseq=refseq,
                                         key_residues=krs[row],
                                         refplot=refplot,
                                         colors=colors,
                                         line_break=line_break)
    return fig, axs
Example #10
0
def sequencing_enrichment(indata,
                          enrichment=None,
                          bounds=None,
                          matches=None,
                          seqID='A'):
    """Retrieve data from multiple
    `NGS <https://www.wikiwand.com/en/DNA_sequencing#/Next-generation_methods>`_ files.

    Allows to obtain data from multiple files while attaching them to two conditions, a
    primary one (key1) and a secondary one (key2).

    For instance, let's assume that one has data obtained through selection of sequences by two
    different binders and three different concentration of binder each; we would define a
    ``indata`` dictionary such as::

        {'binder1': {'conc1': 'file1.fastq', 'conc2': 'file2.fastq', 'conc3': 'file3.fastq'},
         'binder2': {'conc1': 'file4.fastq', 'conc2': 'file5.fastq', 'conc3': 'file6.fastq'}}

    Also, for each binder we could decide to calculate the enrichment between any two
    concentrations; we can do that by defining a ``enrichment`` dictionary such as::

        {'binder1': ['conc1', 'conc3'],
         'binder2': ['conc1', 'conc3']}

    The enrichment of a binder is the count of the first listed concentration divided by
    the count of the second one. Infinite values (division by a zero count) are reported
    as ``-1`` and undefined values (``0 / 0``) as ``0``.

    :param dict indata: First key is binder, second key is concentration, value is fastq file.
    :param dict enrichment: Key is binder, value is list of two concentrations (min,max)
        to calculate enrichment.
    :param bounds: N and C limit of the sequences. Follow the logic of :func:`adapt_length`
        with ``inclusive`` as :data:`False`.
    :type bounds: :func:`list` of :class:`str`
    :param matches: Sequence pattern to match. Follows the same logic as in
        :func:`.translate_3frames`.
    :type matches: :func:`list` of :class:`str`
    :param str seqID: Identifier used to name the sequence column of the output
        (``sequence_<seqID>``). Default is ``A``.

    :return: :class:`.DesignFrame` with the sequences, counts (sequence) per fastq file and
        enrichment per binder (if requested).

    .. rubric:: Example

    (We skip printing the sequence column to ease visibility of the differences)

    .. ipython::

        In [1]: from rstoolbox.io import read_fastq
           ...: from rstoolbox.utils import sequencing_enrichment
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: pd.set_option('display.max_columns', 20)
           ...: indat = {'binder1': {'conc1': '../rstoolbox/tests/data/cdk2_rand_001.fasq.gz',
           ...:                      'conc2': '../rstoolbox/tests/data/cdk2_rand_002.fasq.gz',
           ...:                      'conc3': '../rstoolbox/tests/data/cdk2_rand_003.fasq.gz'},
           ...:          'binder2': {'conc1': '../rstoolbox/tests/data/cdk2_rand_004.fasq.gz',
           ...:                      'conc2': '../rstoolbox/tests/data/cdk2_rand_005.fasq.gz',
           ...:                      'conc3': '../rstoolbox/tests/data/cdk2_rand_006.fasq.gz'}}
           ...: df = sequencing_enrichment(indat)
           ...: df[[_ for _ in df.columns if _ != 'sequence_A']].head()

        In [1]: enrich = {'binder1': ['conc1', 'conc3'],
           ...:           'binder2': ['conc1', 'conc3']}
           ...: df = sequencing_enrichment(indat, enrich)
           ...: df[[_ for _ in df.columns if _ != 'sequence_A']].head()

    """
    from rstoolbox.components import DesignFrame

    def merge_on_sequence(frames):
        # Outer-join all frames on the protein sequence; absent counts become 0.
        return reduce(
            lambda left, right: pd.merge(
                left, right, on='sequence_A', how='outer'), frames).fillna(0)

    def condition_reader(jobid, filename, bounds, matches):
        # Read one fastq file and return its unique protein sequences together with
        # the number of reads of each one, stored in a column named ``jobid``.
        from rstoolbox.io import read_fastq
        df = read_fastq(filename)
        df['sequence_A'] = df.apply(
            lambda row: translate_3frames(row['sequence_A'], matches), axis=1)
        if bounds is not None:
            df['sequence_A'] = adapt_length(df['sequence_A'].values, bounds[0],
                                            bounds[1])

        counts = df.groupby('sequence_A').agg('count').reset_index()
        df = df.merge(counts, on='sequence_A', how='left')
        df = df.drop_duplicates('sequence_A').reset_index(drop=True)
        df.rename(columns={
            'description_x': 'description',
            'description_y': jobid
        },
                  inplace=True)
        # The read identifier is discarded in the final output anyway. Dropping it
        # here keeps the chained merges below from producing clashing
        # ``description_x``/``description_y`` columns, which recent pandas rejects
        # once four or more frames are merged.
        df = df.drop([_ for _ in df.columns if _ == 'description'], axis=1)
        return df.sort_values(jobid, ascending=False)

    def binder_reader(jobid, inputb, bounds, matches):
        # Merge the per-condition counts of a single binder.
        return merge_on_sequence([
            condition_reader(jobid + '_' + cond, inputb[cond], bounds, matches)
            for cond in inputb
        ])

    df = merge_on_sequence([
        binder_reader(binder, indata[binder], bounds, matches)
        for binder in indata
    ])
    df['len'] = df['sequence_A'].map(len)
    df = df.drop([_ for _ in df.columns if _.startswith('description')],
                 axis=1)

    if enrichment is not None:
        for binder in enrichment:
            eb = enrichment[binder]
            id1 = '{0}_{1}'.format(binder, eb[0])
            id2 = '{0}_{1}'.format(binder, eb[1])
            df['enrichment_{}'.format(binder)] = df[id1] / df[id2]
    # x / 0 -> -1 ; 0 / 0 -> 0
    df = df.replace({np.inf: -1, -np.inf: -1}).fillna(0)
    designf = DesignFrame(
        df.rename(columns={'sequence_A': 'sequence_{}'.format(seqID)}))
    # The positional index becomes the ``description`` of each unique sequence.
    designf = designf.reset_index().rename(columns={'index': 'description'})
    return designf
Example #11
0
def sequential_frequencies(df,
                           seqID,
                           query="sequence",
                           seqType="protein",
                           cleanExtra=True,
                           cleanUnused=-1):
    """Generates a :class:`.SequenceFrame` for the frequencies of the sequences in the
    :class:`.DesignFrame` with ``seqID`` identifier.

    If there is a ``reference_sequence`` for this ``seqID``, it will also
    be attached to the :class:`.SequenceFrame`.

    All letters in the sequence will be capitalized. All symbols that
    do not belong to ``string.ascii_uppercase`` will be transformed to `"*"`
    as this is the symbol recognized by the substitution matrices as ``gap``.

    This function is directly accessible through some :class:`.DesignFrame` methods.

    :param df: |df_param|.
    :type df: Union[:class:`.DesignFrame`, :class:`~pandas.DataFrame`]
    :param str seqID: |seqID_param|.
    :param str query: |query_param|.
    :param str seqType: |seqType_param| and ``protein_sse``.
    :param bool cleanExtra: |cleanExtra_param|.
    :param float cleanUnused: |cleanUnused_param|.

    :return: :class:`.SequenceFrame`

    .. seealso::
        :meth:`.DesignFrame.sequence_frequencies`
        :meth:`.DesignFrame.sequence_bits`
        :meth:`.DesignFrame.structure_frequencies`
        :meth:`.DesignFrame.structure_bits`

    .. rubric:: Example

    .. ipython::

        In [1]: from rstoolbox.io import parse_rosetta_file
           ...: from rstoolbox.analysis import sequential_frequencies
           ...: import pandas as pd
           ...: pd.set_option('display.width', 1000)
           ...: pd.set_option('display.max_columns', 500)
           ...: df = parse_rosetta_file("../rstoolbox/tests/data/input_2seq.minisilent.gz",
           ...:                         {'scores': ['score'], 'sequence': 'AB'})
           ...: df = sequential_frequencies(df, 'B')
           ...: df.head()
    """
    from rstoolbox.components import SequenceFrame

    def count_instances(seq, table):
        # Fraction of ``seq`` taken by each symbol of ``table``; symbols that do
        # not appear are set to 0. ``table`` itself is left untouched.
        freqs = copy.deepcopy(table)
        counts = collections.Counter(seq)
        for aa in table:
            hits = counts[aa]
            freqs[aa] = float(hits) / len(seq) if hits > 0 else 0
        return freqs

    # Cast if possible, so that we can access the different methods of DesignFrame.
    # Plain pandas DataFrames do not define ``_subtyp``, so it is read with a default
    # instead of failing with an AttributeError.
    if isinstance(df, pd.DataFrame) and getattr(df, '_subtyp', None) != 'design_frame':
        from rstoolbox.components import DesignFrame
        df = DesignFrame(df)

    # Get all sequences; exclude empty ones (might happen) and uppercase all residues.
    sserie = df.get_sequential_data(query, seqID).replace(
        '', np.nan).dropna().str.upper()
    # Get the table to fill
    table, extra = _get_sequential_table(seqType)
    # One column per position (one row per sequence), then the frequency of each
    # symbol per position; transposed so that positions end up as rows.
    sserie = sserie.apply(lambda x: pd.Series(list(x)))
    sserie = sserie.apply(
        lambda x: pd.Series(count_instances(x.str.cat(), table))).T

    # Create the SequenceFrame
    dfo = SequenceFrame(sserie)
    dfo.measure("frequency")
    dfo.extras(extra)
    # Attach the reference sequence if there is any
    if df.has_reference_sequence(seqID):
        dfo.add_reference(seqID,
                          sequence=df.get_reference_sequence(seqID),
                          shift=df.get_reference_shift(seqID))
    dfo.delete_extra(cleanExtra)
    dfo.delete_empty(cleanUnused)
    dfo.clean()
    shift = df.get_reference_shift(seqID)
    # Shift the index so that the index of the SequenceFrame == PDB count
    if isinstance(shift, int):
        dfo.index = dfo.index + shift
    else:
        # A non-integer shift is used directly as the new index.
        dfo.index = shift
    return dfo