Esempio n. 1
0
def plot_autocorrelation(
    df,
    col,
    plot_type="acf",
    n_lags=40,
    fft=False,
    compute_backend=None,
    viz_backend=None,
    **kwargs,
):
    """Plot the autocorrelation or partial autocorrelation of a column.

    For the ``"plotly"`` visualization backend the correlation values are first
    computed by the compute backend and the result is handed to the plotting
    routine. Every other visualization backend receives the raw column and the
    ``fft`` flag directly.

    Args:
        df: The dataframe holding the series (the project docs expect a
            datetime index).
        col: The column to analyze. When it is a string it must be one of
            ``df.columns``.
        plot_type: Either ``'acf'`` or ``'pacf'``. Defaults to ``"acf"``.
        n_lags: Number of lags to show. Defaults to 40.
        fft: Forwarded to the backend as the FFT switch for the ACF
            computation. Defaults to False.
        compute_backend: Name of the compute backend. ``None`` leaves the
            choice to ``_get_compute_backend``, which is also given ``df``.
        viz_backend: Name of the visualization backend. ``None`` leaves the
            choice to ``_get_viz_backend``.
        **kwargs: Extra keyword arguments. On the plotly path they are passed
            to both the compute call and the plotting call.

    Raises:
        ValueError: ``df`` is not a supported dataframe type.
        ValueError: ``col`` is a string that is not a column of ``df``.

    Returns:
        The figure produced by the visualization backend.
    """
    if not _is_dataframe(df):
        raise ValueError("Unsupported input data type")
    # Only string column names are validated; other selectors pass through.
    if isinstance(col, str) and col not in df.columns:
        raise ValueError(f"{col} not found in dataframe")

    if viz_backend == "plotly":
        # Plotly needs precomputed values plus the white-noise band.
        compute = _get_compute_backend(compute_backend, df)
        data, white_noise = compute.compute_autocorrelation(
            df[col],
            plot_type=plot_type,
            n_lags=n_lags,
            fft=fft,
            **kwargs,
        )
        return _get_viz_backend(viz_backend).viz_plot_autocorrelation(
            data,
            plot_type=plot_type,
            white_noise=white_noise,
            n_lags=n_lags,
            **kwargs,
        )

    # All other backends work from the raw series.
    return _get_viz_backend(viz_backend).viz_plot_autocorrelation(
        df[col],
        plot_type=plot_type,
        n_lags=n_lags,
        fft=fft,
        **kwargs,
    )
Esempio n. 2
0
    def plot_distribution(
        self,
        x: Optional[str] = None,
        contrast: Optional[str] = None,
        viz_backend: Optional[str] = None,
        **kwargs,
    ):
        """Generate distribution plot(s) for this object's input data.

        The call is delegated to the ``viz_distribution`` routine of the
        selected visualization backend, which receives ``self.input_data``.

        Args:
            x (str, optional): The feature name to plot. ``None`` is forwarded
                as-is (documented by the project as "plot all features").
            contrast (str, optional): The feature name to contrast the
                distributions by.
            viz_backend (optional): The visualization backend. Falls back to
                ``self.viz_backend`` when not given.
            **kwargs: Additional keyword arguments forwarded unchanged to the
                visualization backend (the project docs mention ``mode``,
                ``hist_kwargs`` and ``violin_kwargs`` as examples).

        Returns:
            Whatever the backend's ``viz_distribution`` returns.
        """
        chosen_backend = viz_backend if viz_backend else self.viz_backend
        plotter = _get_viz_backend(chosen_backend)

        return plotter.viz_distribution(
            data=self.input_data,
            x=x,
            contrast=contrast,
            **kwargs,
        )
Esempio n. 3
0
    def show(self, viz_backend=None, **kwargs):
        """Render the default cluster plot for this output.

        Delegates to the ``viz_cluster`` routine of the selected visualization
        backend, passing the stored visualization data, method and axis labels.

        Args:
            viz_backend: The visualization backend. Falls back to
                ``self.viz_backend`` when not given.
            **kwargs: Keyword arguments forwarded to the backend.

        Raises:
            ValueError: ``self.viz_data`` is ``None`` (nothing to visualize).

        Returns:
            The cluster plot produced by the backend.
        """
        chosen_backend = viz_backend if viz_backend else self.viz_backend

        if self.viz_data is None:
            raise ValueError("Could not find data to visualize.")

        plotter = _get_viz_backend(chosen_backend)
        return plotter.viz_cluster(
            self.viz_data,
            method=self.method,
            xlabel=self.xlabel,
            ylabel=self.ylabel,
            **kwargs,
        )
Esempio n. 4
0
    def show(self, viz_backend=None, **kwargs):
        """Render the default data heatmap for this output.

        Delegates to the ``viz_data_heatmap`` routine of the selected
        visualization backend, passing the stored visualization data together
        with ``self.colnames`` and ``self.missing``.

        Args:
            viz_backend: The visualization backend. Falls back to
                ``self.viz_backend`` when not given.
            **kwargs: Keyword arguments forwarded to the backend.

        Raises:
            ValueError: ``self.viz_data`` is ``None`` (nothing to visualize).

        Returns:
            The heatmap plot produced by the backend.
        """
        chosen_backend = viz_backend if viz_backend else self.viz_backend

        if self.viz_data is None:
            raise ValueError("Could not find data to visualize.")

        plotter = _get_viz_backend(chosen_backend)
        return plotter.viz_data_heatmap(
            self.viz_data,
            colnames=self.colnames,
            missing=self.missing,
            **kwargs,
        )
Esempio n. 5
0
    def show(self, viz_backend=None, **kwargs):
        """Render the default scatter plot matrix for this output.

        Delegates to the ``viz_scatter_plot`` routine of the selected
        visualization backend. Keyword arguments stored on the instance are
        merged with the ones given here; the call-time values win on conflict.

        Args:
            viz_backend: The visualization backend. Falls back to
                ``self.viz_backend`` when not given.
            **kwargs: Keyword arguments that override ``self.kwargs``.

        Raises:
            ValueError: ``self.num_data`` is ``None`` (no data to plot).

        Returns:
            The scatter plot produced by the backend.
        """
        if self.num_data is None:
            raise ValueError("Could not find data to visualize.")

        chosen_backend = viz_backend if viz_backend else self.viz_backend
        plotter = _get_viz_backend(chosen_backend)

        return plotter.viz_scatter_plot(
            self.num_data,
            self.mode,
            self.sample,
            self.diagnostics,
            self.threshold,
            # Later entries win, so call-time kwargs override stored ones.
            **{**self.kwargs, **kwargs},
        )
Esempio n. 6
0
def plot_time_series(
    df,
    col,
    decompose=False,
    model="additive",
    compute_backend=None,
    viz_backend=None,
    **kwargs,
):
    """Plot one or more time series columns, optionally decomposed.

    Without decomposition the dataframe and column selection go straight to
    the visualization backend. With ``decompose`` set, the compute backend
    first produces a decomposition result that is passed along for plotting.

    Args:
        df: The dataframe holding the series (the project docs expect a
            datetime index).
        col (str or [str]): Column(s) of interest.
        decompose: When truthy, decompose the series before plotting.
            Defaults to False.
        model: Decomposition model forwarded to the compute backend when
            ``decompose`` is set. Defaults to "additive".
        compute_backend: Name of the compute backend. ``None`` leaves the
            choice to ``_get_compute_backend``, which is also given ``df``.
        viz_backend: Name of the visualization backend. ``None`` leaves the
            choice to ``_get_viz_backend``.
        **kwargs: Extra keyword arguments. On the decompose path they are
            passed to both the compute call and the plotting call.

    Raises:
        ValueError: ``df`` is not a supported dataframe type.
        ValueError: ``col`` is neither a list nor a string.

    Returns:
        The figure produced by the visualization backend.
    """
    if not _is_dataframe(df):
        raise ValueError("Unsupported input data type")
    if not isinstance(col, (list, str)):
        raise ValueError(f"{col} must be list type or string type")

    if not decompose:
        # Plain plot: no compute step is needed.
        return _get_viz_backend(viz_backend).viz_plot_time_series(
            df, col, **kwargs)

    compute = _get_compute_backend(compute_backend, df)
    decomposition = compute.compute_decompose_timeseries(
        df,
        col=col,
        model=model,
        **kwargs,
    )
    return _get_viz_backend(viz_backend).viz_plot_time_series(
        df,
        col=col,
        result=decomposition,
        decompose=decompose,
        **kwargs,
    )
Esempio n. 7
0
def test_api_methods(backend, backend_module, api_method):
    """Look up ``<backend>_<api_method>`` on the given backend module.

    Combinations listed in the exclusion tuples below are silently passed
    over. Any ``backend`` other than ``"compute"`` or ``"viz"`` is skipped
    through ``pytest.skip``.
    """
    if backend == "compute":
        if backend_module not in ("pandas", "modin"):
            return
        if api_method in ("distribution", "scatter_plot", "correlation_matrix"):
            return
        # These two methods are additionally excluded for modin only.
        if backend_module == "modin" and api_method in ("data_heatmap",
                                                        "cluster"):
            return
        # NOTE(review): __getattr__ is invoked directly rather than through
        # getattr(); presumably the backend wrapper defines it — confirm
        # before changing.
        _get_compute_backend(backend_module).__getattr__(
            "_".join((backend, api_method)))
    elif backend == "viz":
        if backend_module not in ("seaborn", "plotly"):
            return
        if api_method in ("data_summary", "distribution", "scatter_plot"):
            return
        _get_viz_backend(backend_module).__getattr__(
            "_".join((backend, api_method)))
    else:
        pytest.skip(f"Skipped {backend}({backend_module})_{api_method}")
Esempio n. 8
0
    def visualize_topic_summary(self, viz_backend: str = "pyLDAvis"):
        """Display the topic-summary visual for an LDA model.

        Delegates to the ``viz_visualize_topic_summary`` routine of the
        selected visualization backend, passing the stored model, corpus and
        dictionary.

        Args:
            viz_backend (str): The visualization backend. Defaults to
                "pyLDAvis".

        Raises:
            TypeError: The stored model type is not "LDA".

        Returns:
            The visual produced by the backend.
        """
        if self._model_type != "LDA":
            raise TypeError("Model must be an LDA Model")

        plotter = _get_viz_backend(viz_backend)
        return plotter.viz_visualize_topic_summary(
            self._model,
            self._corpus,
            self._dictionary,
        )
Esempio n. 9
0
def importance(
    data,
    target: str,
    preprocess_func=None,
    estimator=None,
    return_values: bool = False,
    truncate: bool = True,
    top_features: Optional[int] = None,
    compute_backend: Optional[str] = None,
    viz_backend: Optional[str] = None,
    **kwargs,
):
    """Variable importance chart.

    The compute backend estimates the importance of each feature with respect
    to ``target``; the result is either returned directly or plotted by the
    visualization backend. Per the project documentation the estimate comes
    from a simple fitted model, so it depends on that model's accuracy and
    should be refined during modeling.

    Args:
        data: The input data frame.
        target: Name of the response column, as a string.
        preprocess_func: Optional custom preprocessing function, forwarded to
            the compute backend (documented as taking the dataframe and the
            target column name and returning ``(X, y)``).
        estimator: Optional custom estimator, forwarded to the compute
            backend. ``None`` leaves the choice to the backend (documented
            default: a random forest classifier).
        return_values: If True, return only the importance values and skip
            plotting.
        truncate: Forwarded to the compute backend (documented as setting
            negative importance values to zero).
        top_features: Plot only the first N entries of the index returned by
            the compute backend. ``None`` (or 0) uses ``len(cols)``.
        compute_backend: The compute backend.
        viz_backend: The visualization backend.
        **kwargs: Other arguments forwarded to the compute backend's
            ``compute_importance``.

    Returns:
        The importance values when ``return_values`` is True, otherwise the
        figure produced by the visualization backend.
    """
    compute = _get_compute_backend(compute_backend, data)
    importance_values, idx, cols = compute.compute_importance(
        data,
        target,
        preprocess_func,
        estimator,
        truncate,
        **kwargs,
    )

    if return_values:
        return importance_values

    # A falsy top_features means "show every column".
    n_shown = top_features or len(cols)
    return _get_viz_backend(viz_backend).viz_importance(
        importance_values,
        idx[:n_shown],
        cols,
    )
Esempio n. 10
0
    def cluster_search_plot(self, viz_backend=None, **kwargs):
        """Plot the results of the cluster search.

        Delegates to the ``viz_cluster_search_plot`` routine of the selected
        visualization backend, passing the stored cluster range, scores and
        metric.

        Args:
            viz_backend: The visualization backend.
            **kwargs: Additional keyword arguments to pass to the
                visualization backend.

        Raises:
            ValueError: ``self.search`` is falsy, so there is nothing to plot.

        Returns:
            The plot produced by the backend.
        """
        if not self.search:
            raise ValueError("Cluster search plot is not applicable.")

        plotter = _get_viz_backend(viz_backend)
        return plotter.viz_cluster_search_plot(
            self.cluster_range,
            self.scores,
            self.metric,
            **kwargs,
        )
Esempio n. 11
0
    def elbow_plot(self, viz_backend: str = None):
        """Create an elbow plot of coherence values vs. number of topics.

        Delegates to the ``viz_elbow_plot`` routine of the selected
        visualization backend, passing the stored minimum and maximum topic
        counts and the coherence values.

        Args:
            viz_backend: The visualization backend.

        Raises:
            ValueError: The instance has no ``_coherence_values`` attribute,
                i.e. there are no coherence values to plot.

        Returns:
            fig: Elbow plot showing coherence values vs number of topics
        """
        # Probe with hasattr instead of raising from inside an
        # ``except AttributeError`` block, so the ValueError is not reported
        # with a misleading "During handling of the above exception" chain.
        if not hasattr(self, "_coherence_values"):
            raise ValueError(
                "Coherence values not defined. At least 2 LDA or LSI models need to be"
                " trained with different numbers of topics.")

        return _get_viz_backend(viz_backend).viz_elbow_plot(
            self._min_topics, self._max_topics, self._coherence_values)
Esempio n. 12
0
def test_missing_viz_implementation():
    """Calling an unknown viz routine on the default backend must raise."""
    with pytest.raises(NotImplementedError):
        default_backend = _get_viz_backend()
        default_backend.viz_nothing()