Exemple #1
0
def plot_mutation_matrix(source,
                         mutant_column="mutant",
                         effect_column="prediction_epistatic",
                         conservation_column="column_conservation",
                         order=AA_LIST_PROPERTY,
                         min_value=None,
                         max_value=None,
                         min_percentile=None,
                         max_percentile=None,
                         show_conservation=False,
                         secondary_structure=None,
                         engine="mpl",
                         **matrix_style):
    """
    Plot a single-substitution mutation matrix

    Parameters
    ----------
    source : evcouplings.couplings.CouplingsModel or pandas.DataFrame
        Plot single mutation matrix predicted using CouplingsModel,
        or effect data for single mutations DataFrame
    mutant_column : str, optional (default: "mutant")
        If using source dataframe, extract single mutations from this column.
        Mutations have to be in format A100V.
    effect_column : str, optional (default: "prediction_epistatic")
        If using source dataframe, extract mutation effect from this column.
        Effects must be numeric (non-numeric entries are coerced to NaN).
    conservation_column : str, optional (default: "column_conservation")
        If using source dataframe, extract column conservation information
        from this column. Conservation values must be between 0 and 1. To
        plot conservation, set show_conservation=True.
    order : str or list, optional (default: AA_LIST_PROPERTY)
        Reorder y-axis (substitutions) according to this parameter. The
        resulting matrix has exactly one column per entry of order: entries
        that do not occur in source are left as NaN, and substitutions in
        source that are not listed in order are dropped. If None,
        substitutions will be inferred from source, and sorted alphabetically
        if source is a DataFrame.
    min_value : float, optional (default: None)
        Threshold colormap at this minimum value. If None, defaults to
        minimum value in matrix; if max_value is also None, defaults to
        -max(abs(matrix))
    max_value : float, optional (default: None)
        Threshold colormap at this maximum value. If None, defaults to
        maximum value in matrix; if min_value is also None, defaults to
        max(abs(matrix))
    min_percentile : int or float, optional (default: None)
        Set min_value to this percentile of the effect distribution. Overrides
        min_value.
    max_percentile : int or float, optional (default: None)
        Set max_value to this percentile of the effect distribution. Overrides
        max_value.
    show_conservation : bool, optional (default: False)
        Plot positional conservation underneath matrix. Only possible for
        engine == "mpl".
    secondary_structure : dict or pd.DataFrame
        Secondary structure to plot above matrix.
        Can be a dictionary of position (int) to
        secondary structure character ("H", "E", "-"/"C"),
        or a DataFrame with columns "id" and "sec_struct_3state"
        (as returned by Chain.residues, and DistanceMap.residues_i
        and DistanceMap.residues_j). Only supported by engine == "mpl".
    engine : {"mpl", "bokeh"}
        Plot matrix using matplotlib (static, more visualization options)
        or with bokeh (interactive, less visualization options)
    **matrix_style : kwargs
        Will be passed on to matrix_base_mpl or matrix_base_bokeh as kwargs

    Returns
    -------
    matplotlib AxesSubplot or bokeh Figure
        Figure/Axes object. Display bokeh figure using show().

    Raises
    ------
    ValueError
        If engine is neither "mpl" nor "bokeh"
    """
    def _extract_secstruct(secondary_structure):
        """
        Turn secondary structure into a string with one character
        per matrix position ("-" where no assignment is available)
        """
        # turn into dictionary representation if
        # passed as a DataFrame
        if isinstance(secondary_structure, pd.DataFrame):
            secondary_structure = dict(
                zip(secondary_structure.id.astype(int),
                    secondary_structure.sec_struct_3state))

        # only positions of the mutation matrix are ever looked up, so
        # entries outside the matrix range are ignored automatically
        return "".join(
            [secondary_structure.get(i, "-") for i in positions])

    conservation = None

    # test if we will extract information from CouplingsModel,
    # or from a dataframe with mutations
    if isinstance(source, CouplingsModel):
        matrix = source.smm()
        positions = source.index_list
        substitutions = source.alphabet
        wildtype_sequence = source.seq()

        if show_conservation:
            conservation = entropy_vector(source)
    else:
        # extract position, WT and subs for each mutant, and keep singles
        # only; take an explicit copy so the column assignments below act
        # on our own frame rather than on a possible view of the input
        source = split_mutants(
            source, mutant_column).query("num_mutations == 1").copy()

        # turn positions into numbers (may be strings); assigning the whole
        # column (rather than .loc[:, col] = ...) makes sure the new integer
        # dtype is actually adopted
        source["pos"] = pd.to_numeric(source["pos"]).astype(int)

        # same for effects, ensure they are numeric
        source[effect_column] = pd.to_numeric(source[effect_column],
                                              errors="coerce")

        substitutions = sorted(source.subs.unique())

        # group dataframe to get positional information
        source_grp = source.groupby("pos").first().reset_index().sort_values(
            by="pos")
        positions = source_grp.pos.values
        wildtype_sequence = source_grp.wt.values

        if show_conservation:
            conservation = pd.to_numeric(source_grp[conservation_column],
                                         errors="coerce").values

        # create mutation effect matrix
        matrix = np.full((len(positions), len(substitutions)), np.nan)

        # mapping from position/substitution into matrix
        pos_to_i = {p: i for i, p in enumerate(positions)}
        subs_to_j = {s: j for j, s in enumerate(substitutions)}

        # fill matrix with values (in table order, so for duplicated
        # mutants the last row wins)
        for pos, subs, effect in zip(source["pos"], source["subs"],
                                     source[effect_column]):
            matrix[pos_to_i[pos], subs_to_j[subs]] = effect

    # reorder substitutions
    if order is not None:
        order = list(order)

        # one column per entry in order; sizing by len(substitutions)
        # instead would overflow (or leave unlabeled columns) whenever
        # order and the observed substitutions differ in length
        matrix_final = np.full((len(positions), len(order)), np.nan)

        # column index of the first occurrence of each substitution
        column_of = {}
        for j, subs in enumerate(substitutions):
            column_of.setdefault(subs, j)

        # go through new order column by column and put in right place
        for i, subs in enumerate(order):
            j = column_of.get(subs)
            if j is not None:
                matrix_final[:, i] = matrix[:, j]

        # set substitutions to new list
        substitutions = order
    else:
        matrix_final = matrix

    # determine ranges for matrix colormaps
    # get effects without NaNs
    effects = matrix_final.ravel()
    effects = effects[np.isfinite(effects)]

    if min_percentile is not None:
        min_value = np.percentile(effects, min_percentile)

    if max_percentile is not None:
        max_value = np.percentile(effects, max_percentile)

    matrix_style["min_value"] = min_value
    matrix_style["max_value"] = max_value

    # extract secondary structure
    if secondary_structure is not None:
        secondary_structure_str = _extract_secstruct(secondary_structure)
    else:
        secondary_structure_str = None

    if engine == "mpl":
        return matrix_base_mpl(matrix_final,
                               positions,
                               substitutions,
                               conservation=conservation,
                               wildtype_sequence=wildtype_sequence,
                               secondary_structure=secondary_structure_str,
                               **matrix_style)
    elif engine == "bokeh":
        # cannot pass conservation for bokeh
        return matrix_base_bokeh(matrix_final,
                                 positions,
                                 substitutions,
                                 wildtype_sequence=wildtype_sequence,
                                 **matrix_style)
    else:
        raise ValueError(
            "Invalid plotting engine selected, valid options are: "
            "mpl, bokeh")
Exemple #2
0
def mutation_pymol_script(mutation_table,
                          output_file,
                          effect_column="prediction_epistatic",
                          mutant_column="mutant",
                          agg_func="mean",
                          cmap=plt.cm.RdBu_r,
                          segment_to_chain_mapping=None):
    """
    Create a Pymol .pml script to visualize single mutation
    effects

    Parameters
    ----------
    mutation_table : pandas.DataFrame
        Table with mutation effects (will be filtered
        for single mutants)
    output_file : str
        File path where to store pml script
    effect_column : str, optional (default: "prediction_epistatic")
        Column in mutation_table that contains mutation effects
    mutant_column : str, optional (default: "mutant")
        Column in mutation_table that contains mutations
        (in format "A123G")
    agg_func : str, optional (default: "mean")
        Function used to aggregate single mutations into one
        aggregated effect per position (any pandas aggregation
        operation, including "mean", "min", "max")
    cmap : matplotlib.colors.LinearSegmentedColormap, optional
            (default: plt.cm.RdBu_r)
        Colormap used to map mutation effects to colors
    segment_to_chain_mapping: str or dict(str -> str), optional (default: None)
        PDB chain(s) that should be targeted by the residue selections

        * If None, residues will be selected
          by position alone, which may cause wrong assignments
          if multiple chains are present in the structure.

        * If a string, this chain is used for all segments.

        * Different chains can be assigned for position
          if a dictionary that maps from segment (str) to PDB chain (str)
          is given. Rows without a segment are grouped under the
          segment name "none".

    Raises
    ------
    ValueError
        If no single mutants contained in mutation_table
    ValueError
        If mutation_table contains a segment identifier not
        found in segment_to_chain_mapping
    """
    # split mutation strings
    t = split_mutants(mutation_table, mutant_column)

    # only pick single mutants; copy so that the column assignments
    # below act on our own frame and not on a view of the query result
    t = t.query("num_mutations == 1").copy()

    if len(t) == 0:
        raise ValueError("mutation_table does not contain any single "
                         "amino acid substitutions.")

    # groupby drops rows with missing keys, so missing segments are
    # replaced by a placeholder string. Only the segment column is filled:
    # filling the whole table would turn missing effects into strings and
    # break the numeric aggregation further down.
    if "segment" not in t.columns:
        t["segment"] = "none"
    else:
        t["segment"] = t["segment"].fillna("none")

    # handle each segment independently
    segments = list(t.groupby("segment"))

    # resolve the target chain of every segment before touching the output
    # file, so an invalid mapping does not leave a truncated script behind
    chains = {}
    for segment_name, _ in segments:
        if segment_to_chain_mapping is None:
            chain = None

        elif isinstance(segment_to_chain_mapping, str):
            chain = segment_to_chain_mapping

        elif segment_name not in segment_to_chain_mapping:
            raise ValueError("Segment name {} has no mapping to PyMOL "
                             "chain. Available mappings are: {}".format(
                                 segment_name, segment_to_chain_mapping))
        else:
            chain = segment_to_chain_mapping[segment_name]

        chains[segment_name] = chain

    # selections for which the cartoon/grey reset was already written
    reset_selections = set()

    with open(output_file, "w") as f:
        for segment_name, segment_table in segments:
            chain = chains[segment_name]

            # aggregate into positional information
            per_mutant = segment_table.loc[:, ["pos", effect_column]].rename(
                columns={
                    "pos": "i",
                    effect_column: "effect"
                })

            t_agg = per_mutant.groupby("i").agg(agg_func).reset_index()
            t_agg["i"] = pd.to_numeric(t_agg["i"]).astype(int)

            # map aggregated effects to colors, using a color range
            # symmetric around 0 (determined per segment)
            max_val = t_agg.effect.abs().max()
            mapper = colormap(-max_val, max_val, cmap)
            t_agg["color"] = t_agg.effect.map(mapper)
            t_agg["show"] = "spheres"

            if chain is not None:
                chain_sel = ", chain '{}'".format(chain)
            else:
                chain_sel = ""

            # reset representation/color only once per selection: repeating
            # "color grey80" for the same selection would overwrite colors
            # already assigned by a previously written segment
            if chain_sel not in reset_selections:
                f.write("as cartoon{}\n".format(chain_sel))
                f.write("color grey80{}\n".format(chain_sel))
                reset_selections.add(chain_sel)

            pymol_mapping(t_agg, f, chain, atom="CA")
Exemple #3
0
def mutation_pymol_script(mutation_table,
                          output_file,
                          effect_column="prediction_epistatic",
                          mutant_column="mutant",
                          agg_func="mean",
                          cmap=plt.cm.RdBu_r,
                          chain=None):
    """
    Write a PyMOL .pml script that displays per-position
    single mutation effects

    Single mutants are extracted from the table, their effects are
    aggregated per position, and the aggregated effect is mapped to
    a color on a scale symmetric around zero.

    Parameters
    ----------
    mutation_table : pandas.DataFrame
        Mutation effect table (only single mutants are used)
    output_file : str
        Path of the pml script that will be written
    effect_column : str, optional (default: "prediction_epistatic")
        Name of the column in mutation_table holding mutation effects
    mutant_column : str, optional (default: "mutant")
        Name of the column in mutation_table holding mutations
        (in format "A123G")
    agg_func : str, optional (default: "mean")
        Aggregation applied to all single mutations of a position
        to obtain one effect per position (any pandas aggregation
        operation, e.g. "mean", "min", "max")
    cmap : matplotlib.colors.LinearSegmentedColormap, optional
            (default: plt.cm.RdBu_r)
        Colormap that translates aggregated effects into colors
    chain : str, optional (default: None)
        If given, restrict residue selection to this PDB chain

    Raises
    ------
    ValueError
        If mutation_table does not contain any single mutants
    """
    # parse mutation strings and retain single substitutions only
    singles = split_mutants(mutation_table,
                            mutant_column).query("num_mutations == 1")

    if not len(singles):
        raise ValueError("mutation_table does not contain any single "
                         "amino acid substitutions.")

    # reduce to position and effect, using the column names
    # used for the aggregated table below
    renaming = {"pos": "i", effect_column: "effect"}
    per_mutant = singles[["pos", effect_column]].rename(columns=renaming)

    # one aggregated effect per position
    grouped = per_mutant.groupby("i")
    t_agg = grouped.agg(agg_func).reset_index()
    t_agg.loc[:, "i"] = pd.to_numeric(t_agg["i"]).astype(int)

    # color scale is symmetric around 0, limited by
    # the largest absolute aggregated effect
    limit = t_agg["effect"].abs().max()
    to_color = colormap(-limit, limit, cmap)
    t_agg.loc[:, "color"] = t_agg["effect"].map(to_color)
    t_agg.loc[:, "show"] = "spheres"

    # optional chain restriction appended to the PyMOL commands
    chain_sel = "" if chain is None else ", chain '{}'".format(chain)

    with open(output_file, "w") as f:
        # base representation first, then per-residue mapping
        for template in ("as cartoon{}\n", "color grey80{}\n"):
            f.write(template.format(chain_sel))

        pymol_mapping(t_agg, f, chain, atom="CA")