# Assumed context for this excerpt: the usual aliases (np = numpy, da = dask.array,
# dd = dask.dataframe); normaltest, chisquare, skew, kurtosis, gaussian_kde, Config,
# and _calc_box are provided by the surrounding package.
import math
from typing import Any, Dict

import dask
import dask.array as da
import dask.dataframe as dd
import numpy as np


def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Continuous)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    if cfg.stats.enable or cfg.hist.enable:
        data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()
    if cfg.stats.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values
    srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values
    if cfg.hist.enable or cfg.qqnorm.enable and cfg.insight.enable:
        data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        if cfg.insight.enable:
            data["norm"] = normaltest(data["hist"][0])
    if cfg.hist.enable and cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
    # compute only the required amount of quantiles
    if cfg.qqnorm.enable:
        data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    elif cfg.stats.enable:
        data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    elif cfg.box.enable:
        data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    if cfg.stats.enable or cfg.hist.enable and cfg.insight.enable:
        data["skew"] = skew(srs)
    if cfg.stats.enable or cfg.qqnorm.enable:
        data["mean"] = srs.mean()
        data["std"] = srs.std()
    if cfg.stats.enable:
        data["min"] = srs.min()
        data["max"] = srs.max()
        data["nreals"] = srs.shape[0]
        data["nzero"] = (srs == 0).sum()
        data["nneg"] = (srs < 0).sum()
        data["kurt"] = kurtosis(srs)
        data["mem_use"] = srs.memory_usage(deep=True)
    # compute the density histogram
    if cfg.kde.enable:
        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
        if not math.isclose(dask.compute(data["min"])[0], dask.compute(data["max"])[0]):
            data["dens"] = da.histogram(srs, cfg.kde.bins, (srs.min(), srs.max()), density=True)
            # gaussian kernel density estimate
            data["kde"] = gaussian_kde(
                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
            )
        else:
            data["kde"] = None
    if cfg.box.enable:
        data.update(_calc_box(srs, data["qntls"], cfg))
    if cfg.value_table.enable:
        value_counts = srs.value_counts(sort=False)
        if cfg.stats.enable:
            data["nuniq"] = value_counts.shape[0]
        data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
    elif cfg.stats.enable:
        data["nuniq"] = srs.nunique_approx()
    return data
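
# Usage sketch (illustrative, not from the source): drives cont_comps end to end on a
# tiny column. It assumes a default-constructed Config enables the stats and hist
# plots (the real constructor lives in the surrounding package). Most values in the
# returned dict are lazy dask objects, so a single dask.compute materializes them in
# one shared pass over the data.
if __name__ == "__main__":
    import pandas as pd

    demo_srs = dd.from_pandas(
        pd.Series([1.0, 2.5, 0.0, -1.1, None, 4.2], name="x"), npartitions=2
    )
    demo_cfg = Config()  # assumption: default flags enable stats/hist
    (computed,) = dask.compute(cont_comps(demo_srs, demo_cfg))
    print(computed["nrows"], computed["npres"])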

# Assumed imports for this excerpt (the originals live at module top level).
from typing import Any, Dict, Union

import pandas
from dask.dataframe import DataFrame, Series


def dataframe_from_series_of_record_dict(
        series_of_record_dicts: Series,
        schema: Union[pandas.DataFrame, DataFrame, Dict[Any, Any]]):
    """
    Converts a dask.dataframe.Series whose rows each hold a list of record dicts
    into a dask.dataframe.DataFrame. The given schema is used as the schema of
    the result.

    Args:
        series_of_record_dicts: dask Series whose rows each hold a list of record dicts.
        schema: schema of the result dataframe

    Return:
        dask dataframe

    Examples:
        >>> import dask.dataframe
        >>> import pandas
        >>> df = pandas.DataFrame({'a': range(100), 'b': range(100, 200)})
        >>> print(df)
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b
        0    0  100
        1    1  101
        2    2  102
        3    3  103
        4    4  104
        ..  ..  ...
        95  95  195
        96  96  196
        97  97  197
        98  98  198
        99  99  199
        [100 rows x 2 columns]
        >>> ddf = dask.dataframe.from_pandas(df, npartitions=4)
        >>> def build_df_from_row(row):
        ...     return [{'values': row.a, 'support': row.b},
        ...             {'values': row.a + 1, 'support': row.b},
        ...             {'values': row.a + 2, 'support': row.b}]
        >>> df_ds = ddf.apply(build_df_from_row, axis=1, meta='object')
        >>> print(df_ds)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask Series Structure:
        npartitions=4
        0     object
        25       ...
        50       ...
        75       ...
        99       ...
        dtype: object
        Dask Name: apply, 8 tasks
        >>> result = dataframe_from_series_of_record_dict(df_ds, {'values': 'int', 'support': 'int'})
        >>> print(result)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask DataFrame Structure:
                      values support
        npartitions=4
        0              int64   int64
        25               ...     ...
        50               ...     ...
        75               ...     ...
        99               ...     ...
        Dask Name: create_pandas_dataframe_in_partition, 12 tasks
        >>> print(result.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
            values  support
        0        0      100
        0        1      100
        0        2      100
        1        1      101
        1        2      101
        ..     ...      ...
        98      99      198
        98     100      198
        99      99      199
        99     100      199
        99     101      199
        [300 rows x 2 columns]
    """
    def create_pandas_dataframe_in_partition(series_chunk: pandas.Series):
        # Flatten each row's list of records and repeat that row's index label,
        # so every output row keeps the index of the input row that produced it.
        records = []
        index = []
        for i, v in series_chunk.items():
            records.extend(v)
            index.extend([i] * len(v))
        df = pandas.DataFrame.from_records(records, index=index)
        return df

    ddf = series_of_record_dicts.map_partitions(
        create_pandas_dataframe_in_partition, meta=schema)
    return ddf
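
# Mechanism note (illustrative, plain pandas, hypothetical data): the per-partition
# helper above flattens each row's list of record dicts while repeating that row's
# index label, which is what keeps output rows traceable to the input rows that
# produced them.
if __name__ == "__main__":
    chunk = pandas.Series({10: [{'v': 1}, {'v': 2}], 11: [{'v': 3}]})
    records, index = [], []
    for i, recs in chunk.items():
        records.extend(recs)           # flatten the records
        index.extend([i] * len(recs))  # repeat the originating index label
    print(pandas.DataFrame.from_records(records, index=index))
    #     v
    # 10  1
    # 10  2
    # 11  3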

# Variant of cont_comps that takes the bin count directly; the commented-out
# `## if cfg...` lines mark where config gating would switch computations on and off.
# Same assumed aliases and module helpers as the cont_comps excerpt above.
def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for
    plot(df, Continuous())

    Parameters
    ----------
    srs
        one numerical column
    bins
        the number of bins in the histogram
    """
    data: Dict[str, Any] = {}

    ## if cfg.stats_enable or cfg.hist_enable or
    # calculate the total number of rows then drop the missing values
    data["nrows"] = srs.shape[0]
    srs = srs.dropna()
    ## if cfg.stats_enable
    # number of not null (present) values
    data["npres"] = srs.shape[0]
    # remove infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # shared computations
    ## if cfg.stats_enable or cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
    data["min"], data["max"] = srs.min(), srs.max()
    ## if cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
    data["hist"] = da.histogram(srs, bins=bins, range=[data["min"], data["max"]])
    ## if cfg.insights_enable and (cfg.qqplot_enable or cfg.hist_enable):
    data["norm"] = normaltest(data["hist"][0])
    ## if cfg.qqplot_enable
    data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    ## elif cfg.stats_enable
    ##     data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    ## elif cfg.boxplot_enable
    ##     data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    ## if cfg.stats_enable or cfg.hist_enable and cfg.insights_enable:
    data["skew"] = skew(srs)

    ## if cfg.stats_enable
    data["nuniq"] = srs.nunique()
    data["nreals"] = srs.shape[0]
    data["nzero"] = (srs == 0).sum()
    data["nneg"] = (srs < 0).sum()
    data["mean"] = srs.mean()
    data["std"] = srs.std()
    data["kurt"] = kurtosis(srs)
    data["mem_use"] = srs.memory_usage(deep=True)

    ## if cfg.hist_enable and cfg.insights_enable
    data["chisq"] = chisquare(data["hist"][0])

    # compute the density histogram
    data["dens"] = da.histogram(srs, bins=bins, range=[data["min"], data["max"]], density=True)

    # gaussian kernel density estimate
    data["kde"] = gaussian_kde(
        srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
    )

    ## if cfg.box_enable
    data.update(calc_box(srs, data["qntls"]))

    return data
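
# Usage sketch (illustrative, not from the source): this variant computes every
# statistic unconditionally, so only the histogram bin count is passed in. As above,
# the returned dict holds unevaluated dask objects until dask.compute runs; only the
# kde entry may touch the data earlier, depending on how gaussian_kde is wrapped.
if __name__ == "__main__":
    import pandas as pd

    demo_srs = dd.from_pandas(pd.Series([0.5, 1.5, 2.5, None], name="x"), npartitions=2)
    (computed,) = dask.compute(cont_comps(demo_srs, bins=10))
    print(computed["hist"][0])  # bin counts of the histogram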

# Assumed imports for this excerpt (the originals live at module top level).
from typing import Any, Dict, Optional, Union

import pandas
from dask.dataframe import DataFrame, Series


def dataframe_from_series_of_pandas(
        series_of_pandas_dataframes: Series,
        schema: Optional[Union[pandas.DataFrame, DataFrame, Dict[Any, Any]]] = None):
    """
    Converts a dask.dataframe.Series whose rows each hold a pandas.DataFrame into
    a dask.dataframe.DataFrame. If schema is given, it is used as the schema of
    the result; if it is None, the dataframe held in the first row of the first
    partition is used. The index of each dataframe inside the series is ignored
    and overwritten with the index of the original series.

    Args:
        series_of_pandas_dataframes: dask Series whose rows each hold a pandas DataFrame.
        schema: schema of the result dataframe

    Return:
        dask dataframe

    Examples:
        >>> import dask.dataframe
        >>> import pandas
        >>> df = pandas.DataFrame({'a': range(100), 'b': range(100, 200)})
        >>> print(df)
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b
        0    0  100
        1    1  101
        2    2  102
        3    3  103
        4    4  104
        ..  ..  ...
        95  95  195
        96  96  196
        97  97  197
        98  98  198
        99  99  199
        [100 rows x 2 columns]
        >>> ddf = dask.dataframe.from_pandas(df, npartitions=4)
        >>> def build_df_from_row(row):
        ...     pdf = pandas.DataFrame({'values': [row.a, row.a + 1, row.a + 2],
        ...                             'support': [row.b, row.b, row.b]})
        ...     return pdf
        >>> df_ds = ddf.apply(build_df_from_row, axis=1)
        >>> print(df_ds)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask Series Structure:
        npartitions=4
        0     object
        25       ...
        50       ...
        75       ...
        99       ...
        dtype: object
        Dask Name: apply, 8 tasks
        >>> result = dataframe_from_series_of_pandas(df_ds)
        >>> print(result)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask DataFrame Structure:
                      values support
        npartitions=4
        0              int64   int64
        25               ...     ...
        50               ...     ...
        75               ...     ...
        99               ...     ...
        Dask Name: create_pandas_dataframe_in_partition, 12 tasks
        >>> print(result.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
            values  support
        0        0      100
        0        1      100
        0        2      100
        1        1      101
        1        2      101
        ..     ...      ...
        98      99      198
        98     100      198
        99      99      199
        99     100      199
        99     101      199
        [300 rows x 2 columns]
    """
    if schema is None:
        # Peek at the first row of the first partition and use that dataframe as meta.
        schema = series_of_pandas_dataframes.head(1).iloc[0]

    def create_pandas_dataframe_in_partition(series_chunk: pandas.Series):
        # Re-index every nested dataframe with its row's index label, then concatenate.
        frames = []
        for i, v in series_chunk.items():
            frames.append(v.set_axis([i] * len(v.index), axis=0))
        return pandas.concat(frames, axis=0)

    ddf = series_of_pandas_dataframes.map_partitions(
        create_pandas_dataframe_in_partition, meta=schema)
    return ddf
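
# Usage sketch (illustrative, names assumed): pass an explicit schema instead of
# letting the function peek at the first row. Any meta accepted by map_partitions
# works; an empty pandas DataFrame with the right dtypes is the most explicit form.
if __name__ == "__main__":
    import dask.dataframe

    demo_ddf = dask.dataframe.from_pandas(pandas.DataFrame({'a': range(4)}), npartitions=2)
    nested = demo_ddf.apply(
        lambda row: pandas.DataFrame({'v': [row.a, row.a * 2]}),
        axis=1, meta=('nested', 'object'),
    )
    explicit = pandas.DataFrame({'v': pandas.Series(dtype='int64')})  # explicit meta
    flat = dataframe_from_series_of_pandas(nested, schema=explicit)
    print(flat.compute())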