def test_distribution(data):
    """Check the widget returned by ``dd.distribution`` and its plot helpers.

    Verifies the widget type, that ``plot_distribution`` yields a matplotlib
    figure for the column labelled numeric ("a") and the one labelled
    categorical ("d") -- each with and without ``contrast="e"`` -- and that
    the default spike/skew settings and values are as expected.

    Args:
        data: Input passed straight to ``dd.distribution`` (presumably a
            pytest fixture providing columns "a", "d" and "e" -- confirm
            against the test suite's fixtures).
    """
    widget = dd.distribution(data)
    assert isinstance(
        widget, DistributionWidget
    ), "Output was not a DistributionWidget"

    # (column, extra keyword arguments, failure message), checked in order.
    plot_cases = (
        ("a", {}, "plot_distribution[numeric] was not a mpl figure"),
        (
            "a",
            {"contrast": "e"},
            "plot_distribution[numeric] with contrast was not a mpl figure",
        ),
        ("d", {}, "plot_distribution[categorical] was not a mpl figure"),
        (
            "d",
            {"contrast": "e"},
            "plot_distribution[categorical] with contrast was not a mpl figure",
        ),
    )
    for column, extra_kwargs, failure_message in plot_cases:
        figure = widget.plot_distribution(column, **extra_kwargs)
        assert isinstance(figure, matplotlib.figure.Figure), failure_message

    assert widget.spike_factor == 10, "Wrong default spike factor"
    assert widget.skew_factor == 3, "Wrong default skew factor"
    assert _is_series(widget.spike_value), "Spike values not a Pandas series"
    assert _is_series(widget.skew_value), "Skew values not a Pandas series"
def _modin_compute_data_summary(data):
    """Perform computation for summary statistics and data description.

    Args:
        data: The dataframe. A series is first wrapped in a Modin DataFrame.

    Raises:
        ValueError: Invalid input data type.

    Returns:
        A SummaryWidget constructed from the (possibly wrapped) data and a
        summary frame holding one metric per row and one input column per
        column.
    """
    modin_pd = _compat["modin.pandas"]

    if _is_series(data):
        data = modin_pd.DataFrame(data)

    if not _is_dataframe(data):
        raise ValueError("Data must be a Modin DataFrame")

    # Save column order
    columns = data.columns

    dtypes = data.agg([lambda x: x.dtype])
    moments = data.agg(["mean", "std", "median"])
    # Min/max and zero counts are computed on numeric columns only, then
    # re-expanded to the full column set so every piece lines up.
    numeric = data.select_dtypes("number")
    minmax = numeric.agg(["min", "max"]).reindex(columns=columns)
    zeros = numeric.agg([_count_zeros]).reindex(columns=columns)
    null_summary = data.agg([_count_nulls])
    freq_summary = data.agg([_most_frequent])

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a
    # single concat stacks the same pieces in the same order.
    summary = modin_pd.concat(
        [dtypes, moments, minmax, zeros, null_summary, freq_summary],
        ignore_index=True,
    )
    summary = summary[columns]
    # One label per stacked row: 1 dtype + 3 moments + 2 min/max + 1 zeros
    # + 1 nulls + 1 most-frequent.
    summary.index = [
        "Data Type",
        "Mean",
        "Standard Deviation",
        "Median",
        "Min",
        "Max",
        "# Zeros",
        "# Nulls",
        "% Most Frequent Value",
    ]

    # Removing NaNs
    summary = summary.fillna("")

    return SummaryWidget(data, summary)
def spikey(data):
    """Measure how "spikey" the histogram of the data is.

    The measure is the height of the tallest histogram bin divided by the
    mean bin height. Missing values are discarded before binning, and the
    bin count is chosen by NumPy's "sturges" rule.

    Args:
        data: The 1-d data array

    Returns:
        Ratio of the tallest bin height and the average bin height.
    """
    if _is_series(data):
        observed = data.dropna()
    else:
        # Non-series input: drop NaN entries with a boolean mask.
        observed = data[~np.isnan(data)]

    # Only the per-bin counts are needed; the bin edges are ignored.
    heights, _ = np.histogram(observed, bins="sturges")
    tallest = max(heights)
    average = np.mean(heights)
    return tallest / average
def _modin_compute_data_summary(data):
    """Perform computation for summary statistics and data description.

    Args:
        data: The dataframe. A series is first wrapped in a Modin DataFrame.

    Raises:
        ValueError: Invalid input data type.

    Returns:
        A SummaryWidget constructed from the data, a table of general
        information (rows, columns, size in memory) and a table of
        per-column statistics indexed by column.
    """
    if _is_series(data):
        data = _compat["modin.pandas"].DataFrame(data)

    if not _is_dataframe(data):
        raise ValueError("Data must be a Modin DataFrame")

    info_data = pd.DataFrame(
        {
            "Info": [
                data.shape[0],
                data.shape[1],
                _sizeof_fmt(data.memory_usage().sum(), ""),
            ]
        },
        index=["Rows", "Columns", "Size in Memory"],
    )

    # Save column order
    columns = data.columns

    dtypes = data.dtypes.to_numpy()
    # Every numeric-only reduction is reindexed against the full column list
    # so each statistic has exactly one entry per column (NaN where the
    # reduction produced none); np.vstack below needs equal-length rows.
    s_mean = data.mean(numeric_only=True).reindex(columns).to_numpy()
    s_sd = data.std(numeric_only=True).reindex(columns).to_numpy()
    s_med = data.median(numeric_only=True).reindex(columns).to_numpy()
    s_min = data.min(numeric_only=True).reindex(columns).to_numpy()
    s_max = data.max(numeric_only=True).reindex(columns).to_numpy()
    # Count the cells equal to zero. Summing the selected zero values
    # themselves would always give 0.
    s_zero = (data == 0).sum().astype(int).to_numpy()
    s_null = data.isnull().sum().astype(int).to_numpy()
    s_unique = data.nunique().to_numpy()
    s_freq = (
        data.apply(lambda x: mode1(x.astype("str")))
        .iloc[
            0,
        ]
        .to_numpy()
    )

    # Stack one statistic per row, then transpose to one row per column.
    summary_data = pd.DataFrame(
        np.vstack(
            [
                dtypes,
                s_null,
                s_zero,
                s_min,
                s_med,
                s_max,
                s_mean,
                s_sd,
                s_unique,
                s_freq,
            ]
        ).transpose(),
        columns=[
            "Data Type",
            "Nulls",
            "Zeros",
            "Min",
            "Median",
            "Max",
            "Mean",
            "Standard Deviation",
            "Unique",
            "Top Frequency",
        ],
        index=columns,
    )

    return SummaryWidget(data, info_data, summary_data)
def _pandas_compute_data_summary(data):
    """Perform computation for summary statistics and data description.

    Columns are processed grouped as numeric, then datetime, then everything
    else; the assembled table is put back into the original column order
    before it is returned.

    Args:
        data: The dataframe. A series is first wrapped in a one-column
            DataFrame named after the series.

    Raises:
        ValueError: Invalid input data type.

    Returns:
        A SummaryWidget constructed from the data, a table of general
        information (rows, columns, size in memory) and a table of
        per-column statistics indexed by column.
    """
    if _is_series(data):
        data = pd.DataFrame(data, columns=[data.name])

    if not _is_dataframe(data):
        raise ValueError("Data must be a Pandas DataFrame")

    info_data = pd.DataFrame(
        {
            "Info": [
                data.shape[0],
                data.shape[1],
                _sizeof_fmt(data.memory_usage().sum()),
            ]
        },
        index=["Rows", "Columns", "Size in Memory"],
    )

    columns = data.columns
    n_columns = len(columns)
    val = data.values

    # Positional indices of each column group within the original order.
    num_columns = data.select_dtypes("number").columns
    num_ind = np.nonzero([c in num_columns for c in columns])[0]
    date_columns = data.select_dtypes(["datetime", "datetimetz"]).columns
    date_ind = np.nonzero([c in date_columns for c in columns])[0]
    other_columns = data.select_dtypes(
        exclude=["number", "datetime", "datetimetz"]
    ).columns
    other_ind = np.nonzero([c in other_columns for c in columns])[0]
    order = np.concatenate([num_ind, date_ind, other_ind], axis=0)

    # `order` holds positions, not labels, so index positionally with .iloc;
    # plain [] would treat the integers as labels for integer-named columns.
    dtypes = data.dtypes.iloc[order]

    # Mean/std/median exist for numeric columns only; pad the tail with NaN
    # so each statistic has one entry per column in `order`.
    num_val = val[:, num_ind]
    num_pad = (0, n_columns - num_ind.size)
    s_mean = np.pad(np.mean(num_val, axis=0), num_pad, constant_values=np.nan)
    # Builtin float replaces the np.float alias, which NumPy has removed.
    s_sd = np.pad(
        np.std(num_val.astype(float), axis=0),
        num_pad,
        constant_values=np.nan,
    )
    s_med = np.pad(
        np.median(num_val, axis=0), num_pad, constant_values=np.nan
    )

    # Min/max are taken over numeric and datetime columns.
    ordered_ind = np.concatenate([num_ind, date_ind])
    ordered_val = val[:, ordered_ind]
    ordered_pad = (0, n_columns - num_ind.size - date_ind.size)
    s_min = np.pad(
        np.min(ordered_val, axis=0), ordered_pad, constant_values=np.nan
    )
    s_max = np.pad(
        np.max(ordered_val, axis=0), ordered_pad, constant_values=np.nan
    )

    # Count the cells equal to zero. Summing the selected zero values
    # themselves would always give 0.
    s_zero = (data == 0).sum().astype(int).iloc[order]
    s_null = data.isnull().sum().astype(int).iloc[order]
    s_unique = data.nunique().iloc[order]
    s_freq = np.apply_along_axis(mode1, 0, val.astype("str"))[order]

    # Stack one statistic per row, transpose to one row per column, then
    # undo the grouping permutation to restore the original column order.
    summary_data = pd.DataFrame(
        np.vstack(
            [
                dtypes,
                s_null,
                s_zero,
                s_min,
                s_med,
                s_max,
                s_mean,
                s_sd,
                s_unique,
                s_freq,
            ]
        ).transpose()[np.argsort(order), :],
        columns=[
            "Data Type",
            "Nulls",
            "Zeros",
            "Min",
            "Median",
            "Max",
            "Mean",
            "Standard Deviation",
            "Unique",
            "Top Frequency",
        ],
        index=data.columns,
    )

    return SummaryWidget(data, info_data, summary_data)