Example #1
def data_heatmap(data,
                 missing=False,
                 compute_backend=None,
                 viz_backend=None,
                 **kwargs) -> HeatmapWidget:
    """Visualizes data patterns in the entire dataset by visualizing as a heatmap.

    This feature operates in two modes.

    (Default): A data heatmap showing standardized values (bounded to [-3, 3]). This
    visualization is useful for showing unusual, ordered patterns in the data that
    would otherwise be unnoticeable in summary statistics or distribution plots.

    Missing: Visualize only missing values.

    Args:
        data: A pandas data frame
        missing (bool): If True, show only missing values
        compute_backend: The compute backend.
        viz_backend: The visualization backend.
        **kwargs: Keyword arguments

    Returns:
        The data heatmap.
    """
    hwidget = _get_compute_backend(compute_backend,
                                   data).compute_data_heatmap(data,
                                                              missing=missing,
                                                              **kwargs)
    hwidget.viz_backend = viz_backend
    return hwidget
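A minimal usage sketch, assuming data_heatmap is importable from the package's public API (the import path below is illustrative) and a supported visualization backend is installed:

import numpy as np
import pandas as pd

from mypackage import data_heatmap  # illustrative import path; adjust to the actual package

df = pd.DataFrame(np.random.randn(100, 4), columns=list("abcd"))
df.loc[::7, "b"] = np.nan  # introduce some missing values

widget = data_heatmap(df)                        # standardized-value heatmap
missing_widget = data_heatmap(df, missing=True)  # missing-value heatmap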
Example #2
def distribution(data,
                 diagnostic=True,
                 compute_backend=None,
                 viz_backend=None,
                 **kwargs) -> DistributionWidget:
    """Distribution Plots.

    Visualizes univariate distributions. This feature can be used to generate
    various types of plots for univariate distributions, including histograms, violin
    plots, and bar (count) plots.

    Args:
        data: Data Frame
        diagnostic: If True, will run diagnostics to select "interesting" plots.
        compute_backend: The compute backend.
        viz_backend: The visualization backend.
        **kwargs: Keyword arguments.

    Raises:
        ValueError: Invalid input data type.

    Returns:
        DistributionWidget
    """
    if not _is_dataframe(data):
        raise ValueError("DataFrame required.")

    widget = _get_compute_backend(compute_backend, data).compute_distribution(
        data, diagnostic=diagnostic, **kwargs)
    return widget
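Assuming the same illustrative import path, distribution could be called as follows; with diagnostic=True the widget is expected to surface only the "interesting" univariate plots:

import numpy as np
import pandas as pd

from mypackage import distribution  # illustrative import path

df = pd.DataFrame({
    "num": np.random.randn(200),
    "cat": np.random.choice(["a", "b", "c"], size=200),
})

widget = distribution(df, diagnostic=True)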
Example #3
def stationarity_test(df,
                      col,
                      test="dickey-fuller",
                      regression="c",
                      compute_backend=None,
                      **kwargs):
    """Perform stationarity tests to see if mean and variance are changing over time.

    The backend uses statsmodels.tsa.stattools.adfuller or statsmodels.tsa.stattools.kpss.

    Args:
        df: The dataframe. Must contain a datetime index
        col: The feature of interest
        test: Choice of stationarity test. "kpss" or "dickey-fuller". Defaults to "dickey-fuller".
        regression: Constant and trend order to include in regression. Choose between 'c','ct','ctt', and 'nc'. Defaults to 'c'
        compute_backend: Select computing backend. Defaults to None (pandas).
        **kwargs: Keyword arguments

    Raises:
        ValueError: Invalid input data type.
        ValueError: `col` not found in dataframe.

    Returns:
        Pandas dataframe containing the statistics
    """
    if not _is_dataframe(df):
        raise ValueError("Unsupported input data type")
    if col not in df.columns:
        raise ValueError(f"{col} not found in dataframe")

    data = _get_compute_backend(compute_backend, df).compute_stationarity_test(
        df[col], test, regression, **kwargs)
    return data
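A hedged example of the stationarity tests on a datetime-indexed frame, showing both the default Dickey-Fuller variant and KPSS with a trend term (import path is illustrative):

import numpy as np
import pandas as pd

from mypackage import stationarity_test  # illustrative import path

idx = pd.date_range("2020-01-01", periods=365, freq="D")
ts = pd.DataFrame({"sales": np.random.randn(365).cumsum()}, index=idx)

adf_results = stationarity_test(ts, col="sales")  # Dickey-Fuller (default)
kpss_results = stationarity_test(ts, col="sales", test="kpss", regression="ct")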
Example #4
def test_api_methods(backend, backend_module, api_method):
    if backend == "compute":
        if backend_module in ["pandas", "modin"]:
            if (api_method not in [
                    "distribution", "scatter_plot", "correlation_matrix"
            ]) and not (backend_module == "modin" and
                        (api_method in ["data_heatmap", "cluster"])):
                _get_compute_backend(backend_module).__getattr__("_".join(
                    [backend, api_method]))
    elif backend == "viz":
        if backend_module in ["seaborn", "plotly"]:
            if api_method not in [
                    "data_summary",
                    "distribution",
                    "scatter_plot",
            ]:
                _get_viz_backend(backend_module).__getattr__("_".join(
                    [backend, api_method]))
    else:
        pytest.skip(f"Skipped {backend}({backend_module})_{api_method}")
Example #5
def plot_autocorrelation(
    df,
    col,
    plot_type="acf",
    n_lags=40,
    fft=False,
    compute_backend=None,
    viz_backend=None,
    **kwargs,
):
    """Correlation estimate using partial autocorrelation or autocorrelation.

    Statistics are computed using the statsmodels API.

    Args:
        df: The dataframe with datetime index
        col: The feature of interest
        plot_type: Choose between 'acf' or 'pacf'. Defaults to "acf".
        n_lags: Number of lags to return autocorrelation for. Defaults to 40.
        fft: If True, computes ACF via fast Fourier transform (FFT). Defaults to False.
        compute_backend: Select computing backend. Defaults to None (pandas).
        viz_backend: Select visualization backend. Defaults to None (seaborn).
        **kwargs: Keyword arguments

    Raises:
        ValueError: Invalid input data type.
        ValueError: `col` not found in dataframe.

    Returns:
        The visualization
    """
    if not _is_dataframe(df):
        raise ValueError("Unsupported input data type")
    if isinstance(col, str):
        if col not in df.columns:
            raise ValueError(f"{col} not found in dataframe")
    if viz_backend == "plotly":
        data, white_noise = _get_compute_backend(
            compute_backend, df).compute_autocorrelation(df[col],
                                                         plot_type=plot_type,
                                                         n_lags=n_lags,
                                                         fft=fft,
                                                         **kwargs)
        fig = _get_viz_backend(viz_backend).viz_plot_autocorrelation(
            data,
            plot_type=plot_type,
            white_noise=white_noise,
            n_lags=n_lags,
            **kwargs)
    else:
        fig = _get_viz_backend(viz_backend).viz_plot_autocorrelation(
            df[col], plot_type=plot_type, n_lags=n_lags, fft=fft, **kwargs)
    return fig
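A usage sketch for the autocorrelation plots (illustrative import path); with the plotly backend the autocorrelation is computed first, while other backends delegate directly to the visualization layer:

import numpy as np
import pandas as pd

from mypackage import plot_autocorrelation  # illustrative import path

idx = pd.date_range("2020-01-01", periods=200, freq="D")
ts = pd.DataFrame({"y": np.random.randn(200).cumsum()}, index=idx)

acf_fig = plot_autocorrelation(ts, col="y", plot_type="acf", n_lags=30, fft=True)
pacf_fig = plot_autocorrelation(ts, col="y", plot_type="pacf", viz_backend="plotly")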
Example #6
def correlation_matrix(
    data,
    cluster=False,
    categorical=False,
    compute_backend=None,
    viz_backend=None,
    **kwargs,
) -> CorrelationWidget:
    """Computes correlations (associations) and visualizes as a heatmap.

    This feature combines measures of association for pairs of variables:
        * Numeric-numeric pairs: Pearson correlation
        * Categorical-numeric pairs: Correlation ratio
        * Categorical-categorical pairs
            * More than 2 levels: Cramer's V
            * Only 2 levels for both variables: Point-biserial coefficient

    Args:
        data (DataFrame): A data frame
        cluster (bool): If True, use clustering to reorder similar columns together
        categorical (bool): If True, include categorical associations using Cramer's
            V, Correlation Ratio, and Point-biserial coefficient (a.k.a. Matthews
            correlation coefficient). All associations (including Pearson correlation)
            are scaled to be in the range [0, 1].
        compute_backend: The compute backend.
        viz_backend: The visualization backend.
        **kwargs: Keyword arguments.

    Raises:
        ValueError: Invalid data input type.

    Returns:
        CorrelationWidget
    """
    if not _is_dataframe(data):
        raise ValueError("Data frame required")

    corrwidget = _get_compute_backend(compute_backend,
                                      data).compute_correlation_matrix(
                                          data,
                                          cluster=cluster,
                                          categorical=categorical,
                                          **kwargs)

    corrwidget.viz_backend = viz_backend

    return corrwidget
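A short sketch of the clustering and categorical options (illustrative import path); with categorical=True, mixed-type associations are included and scaled to [0, 1]:

import numpy as np
import pandas as pd

from mypackage import correlation_matrix  # illustrative import path

df = pd.DataFrame({
    "x": np.random.randn(150),
    "y": np.random.randn(150),
    "group": np.random.choice(["a", "b", "c"], size=150),
})

numeric_only = correlation_matrix(df)
mixed = correlation_matrix(df, cluster=True, categorical=True)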
Example #7
def scatter_plots(
    data,
    mode="matrix",
    sample=None,
    threshold=None,
    compute_backend=None,
    viz_backend=None,
    **kwargs,
):
    """Scatter plots of numeric data.

    Args:
        data: A Pandas data frame
        mode (str): {``diagnostic``, ``matrix``, ``all``} The visualization mode.

            * ``diagnostic``: Plots selected by scagnostics (scatter plot diagnostics)
            * ``matrix``: Generate the full scatter plot matrix
            * ``all``: Generate all individual scatter plots
        sample: The sampling method to use. Currently not used.
        threshold: The scatter plot diagnostic threshold value [0,1] for returning a
            plot. Only used with "diagnostic" mode. For example, ``{"Outlying": 0.9}``
            returns plots with outlier metrics above 0.9. See
            ``pyscagnostics.measure_names`` for a list of metrics.

            * If a number: Returns all plots where at least one metric is above this threshold
            * If a dictionary: Returns plots where the metric is above its threshold.
        compute_backend: The compute backend
        viz_backend: The visualization backend
        **kwargs: Passed to the visualization framework

    Raises:
        ValueError: Invalid input data type.

    Returns:
        Scatter plot.
    """
    if not _is_dataframe(data):
        raise ValueError("Unsupported input data type")

    swidget = _get_compute_backend(compute_backend, data).compute_scatter_plot(
        data, mode, sample, threshold, **kwargs)

    swidget.compute_backend = compute_backend
    swidget.viz_backend = viz_backend
    return swidget
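A usage sketch covering the three modes (illustrative import path); diagnostic mode accepts either a single scagnostics threshold or a per-metric dictionary, as described in the docstring:

import numpy as np
import pandas as pd

from mypackage import scatter_plots  # illustrative import path

df = pd.DataFrame(np.random.randn(300, 4), columns=list("abcd"))

matrix_widget = scatter_plots(df)           # full scatter plot matrix
all_widget = scatter_plots(df, mode="all")  # every pairwise plot
outliers = scatter_plots(df, mode="diagnostic", threshold={"Outlying": 0.9})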
Example #8
def importance(
    data,
    target: str,
    preprocess_func=None,
    estimator=None,
    return_values: bool = False,
    truncate: bool = True,
    top_features: Optional[int] = None,
    compute_backend: Optional[str] = None,
    viz_backend: Optional[str] = None,
    **kwargs,
):
    """Variable importance chart.

    This feature fits a simple model to the dataset to generate an estimate
    of feature importance (predictive power). Note that these results are dependent on
    the accuracy of the fitted model and should be refined during modeling.

    Args:
        data: A Pandas data frame
        target: Name of the response column, as a string
        preprocess_func: A custom preprocessing function that takes a Pandas dataframe and the target/response column as a string. Returns X and y as tuple.
        estimator: A custom sklearn estimator. Default is Random Forest Classifier
        return_values: If True, return only the importance values as a numpy array
        truncate: If True, negative importance values will be truncated (set to zero)
        top_features: Return the top N most important features. Default is None (all features)
        compute_backend: The compute backend
        viz_backend: The visualization backend
        **kwargs: Other arguments to be passed to the preprocess function

    Returns:
        Matplotlib figure
    """
    importance_values, idx, cols = _get_compute_backend(
        compute_backend, data).compute_importance(data, target,
                                                  preprocess_func, estimator,
                                                  truncate, **kwargs)

    if return_values:
        return importance_values
    else:
        top_features = top_features or len(cols)
        return _get_viz_backend(viz_backend).viz_importance(
            importance_values, idx[:top_features], cols)
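A hedged example of the importance chart (illustrative import path). A binary response column is used here so the default classifier described in the docstring applies; a custom estimator or preprocessing function could be passed instead:

import numpy as np
import pandas as pd

from mypackage import importance  # illustrative import path

df = pd.DataFrame(np.random.randn(200, 3), columns=["f1", "f2", "f3"])
df["target"] = (df["f1"] > 0).astype(int)  # binary response for the default classifier

fig = importance(df, target="target", top_features=2)
values = importance(df, target="target", return_values=True)  # numpy array only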
Example #9
def plot_time_series(
    df,
    col,
    decompose=False,
    model="additive",
    compute_backend=None,
    viz_backend=None,
    **kwargs,
):
    """Plots time series given a dataframe with datetime index. Statistics are computed using the statsmodels API.

    Args:
        df: The dataframe with datetime index
        col (str or [str]): Column of interest. Column datatype must be numerical
        decompose: Set as True to decompose the timeseries with moving average. Defaults to False.
        model: Specify seasonal component when decompose is True. Defaults to "additive".
        compute_backend: Select computing backend. Defaults to None (pandas).
        viz_backend: Select visualization backend. Defaults to None (seaborn).
        **kwargs: Keyword arguments

    Raises:
        ValueError: Invalid input data type.
        ValueError: `col` not a list or string.

    Returns:
        The visualization
    """
    if not _is_dataframe(df):
        raise ValueError("Unsupported input data type")
    if not isinstance(col, (list, str)):
        raise ValueError(f"{col} must be list type or string type")
    if decompose:
        result = _get_compute_backend(
            compute_backend, df).compute_decompose_timeseries(df,
                                                              col=col,
                                                              model=model,
                                                              **kwargs)
        fig = _get_viz_backend(viz_backend).viz_plot_time_series(
            df, col=col, result=result, decompose=decompose, **kwargs)
    else:
        fig = _get_viz_backend(viz_backend).viz_plot_time_series(
            df, col, **kwargs)
    return fig
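Sketch of plotting a raw series and a seasonal decomposition (illustrative import path); decompose=True routes through the compute backend before visualization:

import numpy as np
import pandas as pd

from mypackage import plot_time_series  # illustrative import path

idx = pd.date_range("2019-01-01", periods=730, freq="D")
ts = pd.DataFrame({"demand": np.random.randn(730).cumsum() + 100}, index=idx)

raw_fig = plot_time_series(ts, col="demand")
decomp_fig = plot_time_series(ts, col="demand", decompose=True, model="additive")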
Example #10
def test_missing_compute_implementation():
    with pytest.raises(NotImplementedError):
        _get_compute_backend().compute_nothing()
Example #11
def cluster(
    data,
    method="kmeans",
    dim_method="pca",
    compute_backend=None,
    viz_backend=None,
    **kwargs,
) -> ClusterWidget:
    """Unsupervised determination of clusters.

    This feature computes clusters using various algorithms (KMeans, HDBSCAN) and then
    projects the data onto a two-dimensional plot for visualization.

    Args:
        data (DataFrame): The data.
        method (str, optional): {'kmeans', 'hdbscan'} The clustering method.
        dim_method (str, optional): The method to use for dimensionality reduction.
        compute_backend (str, optional): The compute backend.
        viz_backend (str, optional): The visualization backend.
        n_clusters (Optional[int], optional): (KMeans) The number of clusters.
        cluster_range (Tuple[int, int], optional): (KMeans) A tuple of the minimum and
            maximum cluster search range. Defaults to (2, 20).
        metric (str): (KMeans) The metric to optimize (from sklearn.metrics).
        target: (KMeans) The labels for supervised clustering, as a 1-D array.
        **kwargs: Keyword arguments.

    Raises:
        ValueError: Data frame required
        ValueError: Clustering method not implemented

    Returns:
        ClusterWidget
    """
    if not _is_dataframe(data):
        raise ValueError("Data frame required")

    if method not in ["kmeans", "hdbscan"]:
        raise ValueError(f"{method} not implemented")

    data = data.select_dtypes("number")

    clusterwidget = _get_compute_backend(compute_backend,
                                         data).compute_cluster(data=data,
                                                               method=method,
                                                               **kwargs)

    viz_data, reductor = dim_reduc(clusterwidget.scaled_data,
                                   2,
                                   dim_method=dim_method)
    viz_data.columns = ["x", "y"]
    viz_data["clusters"] = clusterwidget.clusters

    clusterwidget.viz_data = viz_data
    clusterwidget.reductor = reductor

    if dim_method == "pca":
        var_explained = np.round(reductor.explained_variance_ratio_[:2],
                                 2) * 100
        clusterwidget.xlabel = f"Component 1 ({var_explained[0]}% variance explained)"
        clusterwidget.ylabel = f"Component 2 ({var_explained[1]}% variance explained)"
    else:
        clusterwidget.xlabel = "Dimension 1"
        clusterwidget.ylabel = "Dimension 2"

    clusterwidget.viz_backend = viz_backend

    return clusterwidget
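A minimal clustering sketch (illustrative import path, and assuming the underlying clustering libraries are installed); only numeric columns are used, and when PCA is the reducer the axis labels report the explained variance:

import numpy as np
import pandas as pd

from mypackage import cluster  # illustrative import path

df = pd.DataFrame(np.random.randn(500, 5), columns=list("abcde"))

kmeans_widget = cluster(df, method="kmeans", cluster_range=(2, 10))
hdbscan_widget = cluster(df, method="hdbscan")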
Example #12
def sensitive_data(
    df,
    mode: str = "redact",
    detect_infotypes: bool = True,
    columns: Optional[list] = None,
    score_threshold: float = _DEFAULT_SCORE_THRESHOLD,
    sample_size: int = _SAMPLE_SIZE,
    engine_backend=None,
    compute_backend: Optional[str] = None,
):
    """Identifies, redacts, and/or encrypts PII data.

    Note:
        `sensitive_data` uses Microsoft's Presidio in the backend. Presidio can be used
        to help identify sensitive data. However, because Presidio uses trained ML models,
        there is no guarantee that Presidio will find all sensitive information.

    Args:
        df (DataFrame): The dataframe
        mode (str): {'redact', 'encrypt'}
            redact: Redact the sensitive data
            encrypt: Anonymize the sensitive data
        detect_infotypes (bool): If True, identifies infotypes for each column
        columns ([str]): Defaults to None
        score_threshold (float): Minimum confidence value for detected entities to be returned. Default is 0.2.
        sample_size (int): Number of sampled rows used for identifying column infotypes. Default is 100.
        engine_backend: The backend analyzer engine. Default is presidio_analyzer.
        compute_backend (str): Select compute backend

    Raises:
        ValueError: Invalid input data type.
        TypeError: `columns` not a list of strings.

    Returns:
        SensitiveDataWidget
    """
    if not engine_backend:
        engine_backend = presidio_engine()

    if not _is_dataframe(df):
        raise ValueError("Pandas data frame or modin data frame required")

    if _compat.check_install("modin.pandas"):
        if _is_dataframe(df, "modin"):
            warnings.warn(
                "Sensitive data does not currently support Modin DataFrames. Converting to Pandas."
            )
            df = df._to_pandas()

    if columns:
        if not isinstance(columns, list):
            raise TypeError("cols must be type list")

    if mode not in ["encrypt", "redact", None]:
        raise ValueError("mode must be set to 'encrypt', 'redact', or None")

    sensitivewidget = _get_compute_backend(
        compute_backend, df).compute_sensitive_data(
            df=df,
            mode=mode,
            detect_infotypes=detect_infotypes,
            columns=columns,
            score_threshold=score_threshold,
            sample_size=sample_size,
            engine_backend=engine_backend,
        )

    sensitivewidget.columns = columns
    sensitivewidget.score_threshold = score_threshold
    sensitivewidget.sample_size = sample_size if detect_infotypes else None
    sensitivewidget.engine = engine_backend

    return sensitivewidget
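A hedged example of redaction with infotype detection (illustrative import path); Presidio and its analyzer engine are assumed to be installed:

import pandas as pd

from mypackage import sensitive_data  # illustrative import path

df = pd.DataFrame({
    "name": ["Jane Doe", "John Smith"],
    "email": ["jane@example.com", "john@example.com"],
})

widget = sensitive_data(df, mode="redact", detect_infotypes=True, sample_size=2)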