Ejemplo n.º 1
0
def test_distribution(data):
    """Check the widget returned by ``dd.distribution`` and its defaults.

    The checks cover the widget type, the return type of
    ``plot_distribution`` for columns ``"a"`` and ``"d"`` (with and without
    ``"e"`` as contrast), the default spike/skew factors, and the type of
    the stored spike/skew values.

    Args:
        data: Input handed directly to ``dd.distribution``.
    """
    widget = dd.distribution(data)
    assert isinstance(widget, DistributionWidget), (
        "Output was not a DistributionWidget")

    # Numeric column, without and with a contrast column.
    numeric_fig = widget.plot_distribution("a")
    assert isinstance(numeric_fig, matplotlib.figure.Figure), (
        "plot_distribution[numeric] was not a mpl figure")
    numeric_contrast_fig = widget.plot_distribution("a", contrast="e")
    assert isinstance(numeric_contrast_fig, matplotlib.figure.Figure), (
        "plot_distribution[numeric] with contrast was not a mpl figure")

    # Categorical column, without and with a contrast column.
    categorical_fig = widget.plot_distribution("d")
    assert isinstance(categorical_fig, matplotlib.figure.Figure), (
        "plot_distribution[categorical] was not a mpl figure")
    categorical_contrast_fig = widget.plot_distribution("d", contrast="e")
    assert isinstance(categorical_contrast_fig, matplotlib.figure.Figure), (
        "plot_distribution[categorical] with contrast was not a mpl figure")

    # Default thresholds exposed on the widget.
    assert w_equals(widget.spike_factor, 10) if False else (
        widget.spike_factor == 10), "Wrong default spike factor"
    assert widget.skew_factor == 3, "Wrong default skew factor"

    # Per-column spike/skew values must be series objects.
    assert _is_series(widget.spike_value), "Spike values not a Pandas series"
    assert _is_series(widget.skew_value), "Skew values not a Pandas series"
Ejemplo n.º 2
0
def _modin_compute_data_summary(data):
    """Perform computation for summary statistics and data description.

    Args:
        data: The dataframe; a series is first wrapped in a Modin dataframe.

    Raises:
        ValueError: Invalid input data type.

    Returns:
        A SummaryWidget built from the input data and a summary dataframe
        holding one metric per row and one input column per column.
    """
    modin_pd = _compat["modin.pandas"]

    if _is_series(data):
        data = modin_pd.DataFrame(data)

    if not _is_dataframe(data):
        raise ValueError("Data must be a Modin DataFrame")

    # Save column order
    columns = data.columns
    # Min/max/zero counts are computed on numeric columns only, then
    # reindexed so the remaining columns show up as missing values.
    numeric = data.select_dtypes("number")

    dtypes = data.agg([lambda x: x.dtype])
    moments = data.agg(["mean", "std", "median"])
    minmax = numeric.agg(["min", "max"]).reindex(columns=columns)
    zeros = numeric.agg([_count_zeros]).reindex(columns=columns)
    null_summary = data.agg([_count_nulls])
    freq_summary = data.agg([_most_frequent])

    # Stack the partial results row-wise in the order of the labels below.
    # A single concat replaces the chained DataFrame.append calls, which
    # pandas deprecated in 1.4 and removed in 2.0.
    summary = modin_pd.concat(
        [dtypes, moments, minmax, zeros, null_summary, freq_summary],
        ignore_index=True,
    )
    summary = summary[columns]
    summary.index = [
        "Data Type",
        "Mean",
        "Standard Deviation",
        "Median",
        "Min",
        "Max",
        "# Zeros",
        "# Nulls",
        "% Most Frequent Value",
    ]

    # Removing NaNs
    summary.fillna("", inplace=True)

    return SummaryWidget(data, summary)
Ejemplo n.º 3
0
def spikey(data):
    """Calculates the "spikey-ness" of the histogram.

    Spikeyness is the ratio between the tallest bin and the average bin height.
    Missing values (NaN) are removed before the histogram is computed.

    Args:
        data: The 1-d data; a series, a NumPy array, or any sequence that
            ``np.asarray`` can convert to a numeric array.

    Returns:
        Ratio of the tallest bin height and the average bin height.
    """
    if _is_series(data):
        data = data.dropna()
    else:
        # Coerce first: a plain list/tuple cannot be indexed with a boolean
        # mask, so the NaN filter below would raise TypeError on it.
        data = np.asarray(data)
        data = data[~np.isnan(data)]
    # Only the bin counts matter here; the bin edges are discarded.
    counts, _ = np.histogram(data, bins="sturges")
    return counts.max() / counts.mean()
Ejemplo n.º 4
0
def _modin_compute_data_summary(data):
    """Perform computation for summary statistics and data description.

    Args:
        data: The dataframe; a series is first wrapped in a Modin dataframe.

    Raises:
        ValueError: Invalid input data type.

    Returns:
        A SummaryWidget built from the input data, a small info dataframe
        (row count, column count, memory size) and a summary dataframe with
        one row per input column and one column per metric.
    """
    if _is_series(data):
        data = _compat["modin.pandas"].DataFrame(data)

    if not _is_dataframe(data):
        raise ValueError("Data must be a Modin DataFrame")

    info_data = pd.DataFrame(
        {
            "Info": [
                data.shape[0],
                data.shape[1],
                _sizeof_fmt(data.memory_usage().sum(), ""),
            ]
        },
        index=["Rows", "Columns", "Size in Memory"],
    )

    # Save column order
    columns = data.columns

    dtypes = data.dtypes.to_numpy()
    # Numeric-only statistics drop the other columns; reindexing against the
    # full column list restores one entry per input column (NaN where the
    # statistic does not apply) so every vector stacks to the same length.
    s_mean = data.mean(numeric_only=True).reindex(columns).to_numpy()
    s_sd = data.std(numeric_only=True).reindex(columns).to_numpy()
    s_med = data.median(numeric_only=True).reindex(columns).to_numpy()
    s_min = data.min(numeric_only=True).reindex(columns).to_numpy()
    s_max = data.max(numeric_only=True).reindex(columns).to_numpy()
    # Count the cells equal to zero. Summing the selected zero values
    # themselves would always give 0, so sum the boolean mask instead.
    s_zero = (data == 0).sum().astype(int).to_numpy()
    s_null = data.isnull().sum().astype(int).to_numpy()
    s_unique = data.nunique().to_numpy()
    # mode1 is applied to each column cast to str; the first row of the
    # result is used as the per-column value.
    s_freq = (
        data.apply(lambda x: mode1(x.astype("str")))
        .iloc[
            0,
        ]
        .to_numpy()
    )

    # Each statistic is one stacked row; transposing yields one row per
    # input column, matching the metric labels given below.
    summary_data = pd.DataFrame(
        np.vstack(
            [
                dtypes,
                s_null,
                s_zero,
                s_min,
                s_med,
                s_max,
                s_mean,
                s_sd,
                s_unique,
                s_freq,
            ]
        ).transpose(),
        columns=[
            "Data Type",
            "Nulls",
            "Zeros",
            "Min",
            "Median",
            "Max",
            "Mean",
            "Standard Deviation",
            "Unique",
            "Top Frequency",
        ],
        index=columns,
    )

    return SummaryWidget(data, info_data, summary_data)
Ejemplo n.º 5
0
def _pandas_compute_data_summary(data):
    """Perform computation for summary statistics and data description.

    Args:
        data: The dataframe; a series is first wrapped in a one-column
            Pandas dataframe named after the series.

    Raises:
        ValueError: Invalid input data type.

    Returns:
        A SummaryWidget built from the input data, a small info dataframe
        (row count, column count, memory size) and a summary dataframe with
        one row per input column and one column per metric.
    """
    if _is_series(data):
        data = pd.DataFrame(data, columns=[data.name])

    if not _is_dataframe(data):
        raise ValueError("Data must be a Pandas DataFrame")

    info_data = pd.DataFrame(
        {
            "Info": [
                data.shape[0],
                data.shape[1],
                _sizeof_fmt(data.memory_usage().sum()),
            ]
        },
        index=["Rows", "Columns", "Size in Memory"],
    )

    columns = data.columns
    val = data.values

    # Positions of the numeric, datetime and remaining columns. All metric
    # vectors are built in the grouped order numeric -> datetime -> other,
    # and the original column order is restored after stacking.
    num_columns = data.select_dtypes("number").columns
    num_ind = np.nonzero([c in num_columns for c in columns])[0]
    date_columns = data.select_dtypes(["datetime", "datetimetz"]).columns
    date_ind = np.nonzero([c in date_columns for c in columns])[0]
    other_columns = data.select_dtypes(
        exclude=["number", "datetime", "datetimetz"]
    ).columns
    other_ind = np.nonzero([c in other_columns for c in columns])[0]
    order = np.concatenate([num_ind, date_ind, other_ind], axis=0)

    # `order` holds integer positions, so select with .iloc; plain []
    # would be label-based for integer column names.
    dtypes = data.dtypes.iloc[order]

    # Mean/std/median cover numeric columns only; min/max also cover the
    # datetime columns. Each vector is right-padded with NaN up to the
    # total column count so all rows stack to the same width.
    # NOTE(review): np.std uses ddof=0 and these NumPy reductions do not
    # skip NaN - confirm this is the intended definition.
    s_mean = np.pad(
        np.mean(val[:, num_ind], axis=0),
        (0, len(data.columns) - num_ind.size),
        constant_values=np.nan,
    )
    s_sd = np.pad(
        # Builtin float replaces the np.float alias removed in NumPy 1.24.
        np.std(val[:, num_ind].astype(float), axis=0),
        (0, len(data.columns) - num_ind.size),
        constant_values=np.nan,
    )
    s_med = np.pad(
        np.median(val[:, num_ind], axis=0),
        (0, len(data.columns) - num_ind.size),
        constant_values=np.nan,
    )
    s_min = np.pad(
        np.min(val[:, np.concatenate([num_ind, date_ind])], axis=0),
        (0, len(data.columns) - num_ind.size - date_ind.size),
        constant_values=np.nan,
    )
    s_max = np.pad(
        np.max(val[:, np.concatenate([num_ind, date_ind])], axis=0),
        (0, len(data.columns) - num_ind.size - date_ind.size),
        constant_values=np.nan,
    )
    # Count the cells equal to zero. Summing the selected zero values
    # themselves would always give 0, so sum the boolean mask instead.
    s_zero = (data == 0).sum().astype(int).iloc[order]
    s_null = data.isnull().sum().astype(int).iloc[order]
    s_unique = data.nunique().iloc[order]
    # mode1 is applied column-wise to the values cast to str.
    s_freq = np.apply_along_axis(mode1, 0, val.astype("str"))[order]

    # After the transpose each row is one column in grouped order;
    # argsort(order) permutes the rows back to the original column order.
    summary_data = pd.DataFrame(
        np.vstack(
            [
                dtypes,
                s_null,
                s_zero,
                s_min,
                s_med,
                s_max,
                s_mean,
                s_sd,
                s_unique,
                s_freq,
            ]
        ).transpose()[np.argsort(order), :],
        columns=[
            "Data Type",
            "Nulls",
            "Zeros",
            "Min",
            "Median",
            "Max",
            "Mean",
            "Standard Deviation",
            "Unique",
            "Top Frequency",
        ],
        index=data.columns,
    )

    return SummaryWidget(data, info_data, summary_data)