def __init__(
        self,
        df=None,
        minimal=False,
        explorative=False,
        config_file: Union[Path, str] = None,
        lazy: bool = True,
        **kwargs,
    ):
        """Generate a ProfileReport based on a pandas DataFrame

        Exactly one configuration source is loaded, checked in this order:
        `config_file`, then `minimal`, then `explorative`. When none of them
        is given the current global configuration is left as it is. Keyword
        arguments are applied on top of whichever configuration is active.

        Args:
            df: the pandas DataFrame (may be omitted only when `lazy` is True)
            minimal: minimal mode is a default configuration with minimal computation
            explorative: load the bundled explorative configuration; ignored
                when `config_file` or `minimal` is given
            config_file: a config file (.yml), mutually exclusive with `minimal`
            lazy: compute when needed
            **kwargs: other arguments, for valid arguments, check the default configuration file.

        Raises:
            ValueError: if both `config_file` and `minimal` are given, or if
                `lazy` is False and no DataFrame is provided.
        """
        if config_file is not None and minimal:
            raise ValueError(
                "Arguments `config_file` and `minimal` are mutually exclusive."
            )

        if df is None and not lazy:
            # An eager report needs data to compute on right away.
            raise ValueError(
                "Can't init a not-lazy ProfileReport with no DataFrame"
            )

        if config_file:
            config.set_file(config_file)
        elif minimal:
            config.set_file(get_resource("configs/config_minimal.yaml"))
        elif explorative:
            config.set_file(get_resource("configs/config_explorative.yaml"))
        # TODO: log (rather than warn) when the active configuration is not the
        # default, pointing users at 'pandas_profiling.clear_config()'.

        config.set_kwargs(kwargs)

        # Cached artefacts; all start empty and are filled in elsewhere.
        self.df = None
        self._df_hash = -1
        self._description_set = None
        self._title = None
        self._report = None
        self._html = None
        self._widgets = None
        self._json = None

        if df is not None:
            # preprocess df
            self.df = self.preprocess(df)

        if not lazy:
            # Trigger building the report structure
            _ = self.report
def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """Draw a scatter (or hexbin) plot from a series of coordinate sequences.

    Every element of ``series`` is unpacked into positional coordinates, so
    elements are expected to be sequences of length 2. A hexbin plot replaces
    the scatter plot once the series is longer than the configured
    ``plot.scatter_threshold``.

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series: the Series
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        color = config["html"]["style"]["primary_color"].get(str)
        scatter_threshold = config["plot"]["scatter_threshold"].get(int)

        use_hexbin = len(series) > scatter_threshold
        if use_hexbin:
            cmap = sns.light_palette(color, as_cmap=True)

        # Transpose [(x0, y0), (x1, y1), ...] into per-axis tuples.
        coordinates = tuple(zip(*series.tolist()))
        if use_hexbin:
            plt.hexbin(*coordinates, cmap=cmap)
        else:
            plt.scatter(*coordinates, color=color)
        return plot_360_n0sc0pe(plt)
def scatter_dataset(data: pd.DataFrame,
                    labels=None,
                    visualisation=None,
                    n_components=2,
                    figsize=(6.5, 6.5)) -> str:
    """Generate scatter plot of the whole dataset

    Args:
      data: Pandas DataFrame to generate scatter plot from.
      labels: forwarded unchanged to the plotting helper.
      visualisation: visualisation technique. Defaults to a fresh
        ``PCA(random_state=0)`` per call. Its ``n_components`` attribute is
        overwritten with ``n_components``.
      n_components: number of components. 3 produces a 3D plot; any other
        value is drawn on 2D axes.
      figsize: figure size (width, height) in inches for the 2-component
        plot; one inch is added to each dimension otherwise.

    Returns:
      The resulting scatter plot encoded as a string.
    """
    if visualisation is None:
        # Build the default per call: a default instance created in the
        # signature would be shared (and mutated) across all calls.
        visualisation = PCA(random_state=0)

    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    ]):
        fig = plt.figure(
            figsize=figsize if n_components == 2 else (figsize[0] + 1,
                                                       figsize[1] + 1))
        # Create exactly one axes; adding a 2D axes first and a 3D axes on top
        # leaves a stray 2D frame behind the 3D plot.
        if n_components == 3:
            plot = fig.add_subplot(111, projection="3d")
        else:
            plot = fig.add_subplot(111)
        plot.set_xlabel("x")
        plot.set_ylabel("y")
        if n_components == 3:
            plot.set_zlabel("z")
        visualisation.n_components = n_components
        _plot_dataset(plot, data, labels, visualisation)
        plt.subplots_adjust(bottom=0.2)
        return plot_360_n0sc0pe(plt)
def missing_bar(data: pd.DataFrame) -> str:
    """Render the missingno bar chart of missing values.

    Args:
      data: Pandas DataFrame to generate missing values bar plot from.

    Returns:
      The resulting missing values bar plot encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        labels = config["plot"]["missing"]["force_labels"].get(bool)
        bar_color = hex_to_rgb(
            config["html"]["style"]["primary_color"].get(str))
        ax = missingno.bar(
            data,
            figsize=(10, 5),
            color=bar_color,
            fontsize=get_font_size(data),
            labels=labels,
        )

        # Show the full frame around the returned axes.
        for spine in ax.spines.values():
            spine.set_visible(True)

        # Disable grid lines on every axes of the current figure.
        for axes in plt.gcf().get_axes():
            axes.grid(False)

        plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.3)
        return plot_360_n0sc0pe(plt)
def scatter_complex(series: pd.Series) -> str:
    """Scatter plot (or hexbin plot) from a series of complex values

    The real part is drawn on the x-axis and the imaginary part on the
    y-axis. A hexbin plot replaces the scatter plot once the series is longer
    than the configured ``plot.scatter_threshold``.

    Examples:
        >>> complex_series = pd.Series([complex(1, 3), complex(3, 1)])
        >>> scatter_complex(complex_series)

    Args:
        series: the Series

    Returns:
        A string containing (a reference to) the image
    """
    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    ]):
        plt.ylabel("Imaginary")
        plt.xlabel("Real")

        color = config["html"]["style"]["primary_color"].get(str)
        scatter_threshold = config["plot"]["scatter_threshold"].get(int)

        # np.real/np.imag return the object's own .real/.imag when present
        # and otherwise fall back to converting it to an array, so this does
        # not depend on the Series exposing those attributes.
        real_part = np.real(series)
        imaginary_part = np.imag(series)

        if len(series) > scatter_threshold:
            cmap = sns.light_palette(color, as_cmap=True)
            plt.hexbin(real_part, imaginary_part, cmap=cmap)
        else:
            plt.scatter(real_part, imaginary_part, color=color)

        return plot_360_n0sc0pe(plt)
def correlation_matrix(data: pd.DataFrame, vmin: int = -1) -> str:
    """Plot image of a matrix correlation.

    Args:
      data: The matrix correlation to plot. Its column labels are used for
        the ticks of both axes.
      vmin: Minimum value of value range. When 0, only half of the configured
        colormap is used.

    Returns:
      The resulting correlation matrix encoded as a string.
    """
    # Function-scope import: only needed to copy the colormap below.
    import copy

    with matplotlib.style.context([
            "seaborn-ticks",
            str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    ]):
        _, axes_cor = plt.subplots()
        cmap_name = config["plot"]["correlation"]["cmap"].get(str)
        cmap_bad = config["plot"]["correlation"]["bad"].get(str)

        cmap = plt.get_cmap(cmap_name)
        if vmin == 0:
            cmap = get_cmap_half(cmap)
        # Work on a private copy: set_bad mutates the colormap, and get_cmap
        # may hand out matplotlib's shared registered instance.
        cmap = copy.copy(cmap)
        cmap.set_bad(cmap_bad)

        labels = data.columns
        matrix_image = axes_cor.imshow(data,
                                       vmin=vmin,
                                       vmax=1,
                                       interpolation="nearest",
                                       cmap=cmap)
        cbar = plt.colorbar(matrix_image)
        cbar.outline.set_visible(False)

        # Explain the "bad" colour only when the matrix contains nulls.
        if data.isnull().values.any():
            legend_elements = [
                Patch(facecolor=cmap(np.nan), label="invalid\ncoefficient")
            ]

            plt.legend(
                handles=legend_elements,
                loc="upper right",
                handleheight=2.5,
            )

        # One tick per label along each axis.
        axes_cor.set_xticks(
            np.arange(0, data.shape[0],
                      float(data.shape[0]) / len(labels)))
        axes_cor.set_yticks(
            np.arange(0, data.shape[1],
                      float(data.shape[1]) / len(labels)))

        font_size = get_correlation_font_size(len(labels))
        axes_cor.set_xticklabels(labels, rotation=90, fontsize=font_size)
        axes_cor.set_yticklabels(labels, fontsize=font_size)
        plt.subplots_adjust(bottom=0.2)
        return plot_360_n0sc0pe(plt)
def predictivity(data: pd.DataFrame) -> str:
    """Plot a bar chart of predictivity scores.

    The columns to plot come from the ``correlations.targets`` setting; when
    that setting is empty, all numeric columns of ``data`` are used. Values
    are rounded to two decimals, made absolute and rescaled to integers in
    [0, 100] before plotting.

    Args:
      data: The matrix of scores to plot.

    Returns:
      The resulting predictivity plot encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        target_variables = config["correlations"]["targets"].get()
        if len(target_variables) == 0:
            numeric_frame = data.select_dtypes(include=np.number)
            target_variables = list(numeric_frame.columns)

        # Default seaborn palette with the 2nd and 4th colours exchanged.
        palette = sns.color_palette().as_hex()
        palette[1], palette[3] = palette[3], palette[1]

        fig_pred, axes_pred = plt.subplots()
        axes_pred.set_ylim(0, 100)

        # Rescale in range [0, 100] for better visualization
        rounded = data[target_variables].round(2)
        scores = (100 * rounded.abs()).astype(int)

        # Barplot predictivity
        scores.plot.bar(
            figsize=(10, 6),
            width=0.8,
            legend=True,
            fontsize=get_predictivity_font_size(scores),
            rot=45,
            ax=axes_pred,
            color=palette,
        )

        # Write each bar's value above the plot area, centred on the bar.
        for bar in axes_pred.patches:
            bar_centre = bar.get_x() + bar.get_width() / 2.
            axes_pred.annotate(
                bar.get_height(),
                (bar_centre, 100),
                ha="center",
                va="center",
                xytext=(0, 15),
                textcoords="offset points",
                rotation=45,
            )

        plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.2)
        return plot_360_n0sc0pe(plt)
def missing_dendrogram(data: pd.DataFrame) -> str:
    """Render the missingno dendrogram of missing values.

    Args:
      data: Pandas DataFrame to generate missing values dendrogram plot from.

    Returns:
      The resulting missing values dendrogram plot encoded as a string.

    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        # The dendrogram uses twice the regular font size.
        dendrogram_font_size = get_font_size(data) * 2.0
        missingno.dendrogram(data, fontsize=dendrogram_font_size)
        plt.subplots_adjust(left=0.1, right=0.9, top=0.7, bottom=0.2)
        return plot_360_n0sc0pe(plt)
Esempio n. 9
0
def test_double_config(console_data, test_output_dir):
    """Passing both --config_file and --minimal to the CLI raises ValueError."""
    report = test_output_dir / "test_double_config.html"
    expected_message = (
        "Arguments `config_file` and `minimal` are mutually exclusive."
    )

    with pytest.raises(ValueError) as excinfo:
        cli_args = [
            "-s",
            "--config_file",
            str(get_resource("configs/config_default.yaml")),
            "--minimal",
            str(console_data),
            str(report),
        ]
        console.main(cli_args)

    assert str(excinfo.value) == expected_message
Esempio n. 10
0
def clustermap(data: pd.DataFrame) -> str:
    """Plot a clustermap of the data.

    Args:
      data: The data to plot.

    Returns:
      The resulting clustermap encoded as a string.

    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        # The helper draws via pyplot; its return value is not needed here.
        _plot_clustermap(data)
        return plot_360_n0sc0pe(plt)
Esempio n. 11
0
def boxplot(series: np.ndarray, series_description: dict) -> str:
    """Plot a boxplot of the data.

    Args:
      series: The data to plot.
      series_description: description of the series, forwarded to the
        boxplot helper.

    Returns:
      The resulting boxplot encoded as a string.

    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        axes = _plot_boxplot(series, series_description)
        axes.figure.tight_layout()
        return plot_360_n0sc0pe(plt)
Esempio n. 12
0
def histogram(series: np.ndarray, series_description: dict,
              bins: Union[int, np.ndarray]) -> str:
    """Plot an histogram of the data.

    Args:
      series: The data to plot.
      series_description: description of the series, forwarded to the
        histogram helper.
      bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
      The resulting histogram encoded as a string.

    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        axes = _plot_histogram(series, series_description, bins)
        # Rotate the x tick labels by 45 degrees.
        axes.xaxis.set_tick_params(rotation=45)
        axes.figure.tight_layout()
        return plot_360_n0sc0pe(plt)
def missing_heatmap(data: pd.DataFrame) -> str:
    """Render the missingno heatmap of missing values.

    Figure height, font size and margins are adapted to the number of
    columns in ``data``.

    Args:
      data: Pandas DataFrame to generate missing values heatmap plot from.

    Returns:
      The resulting missing values heatmap plot encoded as a string.
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        n_columns = len(data.columns)

        # Base height 4; one extra unit per five columns beyond ten, capped at 10.
        extra_height = int((n_columns - 10) / 5) if n_columns > 10 else 0
        height = min(4 + extra_height, 10)

        # Frames with more than 40 columns get a smaller font and wider plot.
        many_columns = n_columns > 40
        font_size = get_font_size(data)
        if many_columns:
            font_size /= 1.4

        labels = config["plot"]["missing"]["force_labels"].get(bool)
        ax = missingno.heatmap(
            data,
            figsize=(10, height),
            fontsize=font_size,
            cmap=config["plot"]["missing"]["cmap"].get(str),
            labels=labels,
        )
        for spine in ax.spines.values():
            spine.set_visible(True)

        plt.subplots_adjust(
            left=0.1 if many_columns else 0.2,
            right=0.9,
            top=0.9 if many_columns else 0.8,
            bottom=0.3,
        )

        return plot_360_n0sc0pe(plt)
Esempio n. 14
0
def scatter_pairwise(series1, series2, x_label, y_label) -> str:
    """Draw a scatter (or hexbin) plot of one series against another.

    A hexbin plot replaces the scatter plot once ``series1`` is longer than
    the configured ``plot.scatter_threshold``.

    Examples:
        >>> widths = pd.Series([800, 1024])
        >>> heights = pd.Series([600, 768])
        >>> scatter_pairwise(widths, heights, "Width", "Height")

    Args:
        series1: the series corresponding to the x-axis
        series2: the series corresponding to the y-axis
        x_label: the label on the x-axis
        y_label: the label on the y-axis

    Returns:
        A string containing (a reference to) the image
    """
    frame_style = str(get_resource("styles/pandas_profiling_frame.mplstyle"))
    with matplotlib.style.context(["seaborn-ticks", frame_style]):
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        color = config["html"]["style"]["primary_color"].get(str)
        scatter_threshold = config["plot"]["scatter_threshold"].get(int)

        if len(series1) <= scatter_threshold:
            plt.scatter(series1.tolist(), series2.tolist(), color=color)
        else:
            cmap = sns.light_palette(color, as_cmap=True)
            plt.hexbin(
                series1.tolist(),
                series2.tolist(),
                gridsize=15,
                cmap=cmap,
            )
        return plot_360_n0sc0pe(plt)
 def __init__(self):
     """Set up the configuration; meant to be called only once.

     When a configuration object already exists, only the default
     configuration file is (re)loaded into it; otherwise ``clear`` is
     called.
     """
     if self.config is not None:
         default_file = str(get_resource("configs/config_default.yaml"))
         self.set_file(default_file)
         return

     self.clear()
 def clear(self):
     """Replace the configuration with a fresh one and load the default file."""
     fresh_config = confuse.Configuration(
         "PandasProfiling", __name__, read=False
     )
     self.config = fresh_config
     default_file = str(get_resource("configs/config_default.yaml"))
     self.set_file(default_file)
Esempio n. 17
0
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
from pandas.plotting import register_matplotlib_converters
import seaborn as sns
from sklearn.decomposition import PCA

from pandas_profiling.config import config
from pandas_profiling.utils.resources import get_resource
from pandas_profiling.visualisation.utils import hex_to_rgb, plot_360_n0sc0pe

# Module-level plotting setup; these calls run once, when the module is imported.
# Register pandas' converters with matplotlib.
register_matplotlib_converters()
# Apply the bundled pandas-profiling matplotlib style sheet.
matplotlib.style.use(str(get_resource("styles/pandas_profiling.mplstyle")))
# Select seaborn's "white" style.
sns.set_style(style="white")


def _plot_boxplot(
        series: np.ndarray,
        series_description: dict,
        figsize: tuple = (6, 4),
):
    """Plot a boxplot from the data and return the AxesSubplot object.

    Args:
        series: The data to plot
        figsize: The size of the figure (width, height) in inches, default (6,4)

    Returns: