Example #1
def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Continuous)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    if cfg.stats.enable or cfg.hist.enable:
        data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()
    if cfg.stats.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values
    srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values
    if cfg.hist.enable or (cfg.qqnorm.enable and cfg.insight.enable):
        data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        if cfg.insight.enable:
            data["norm"] = normaltest(data["hist"][0])
    if cfg.hist.enable and cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
    # compute only the required amount of quantiles
    if cfg.qqnorm.enable:
        data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    elif cfg.stats.enable:
        data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    elif cfg.box.enable:
        data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    if cfg.stats.enable or (cfg.hist.enable and cfg.insight.enable):
        data["skew"] = skew(srs)
    if cfg.stats.enable or cfg.qqnorm.enable:
        data["mean"] = srs.mean()
        data["std"] = srs.std()
    if cfg.stats.enable:
        data["min"] = srs.min()
        data["max"] = srs.max()
        data["nreals"] = srs.shape[0]
        data["nzero"] = (srs == 0).sum()
        data["nneg"] = (srs < 0).sum()
        data["kurt"] = kurtosis(srs)
        data["mem_use"] = srs.memory_usage(deep=True)
    # compute the density histogram
    if cfg.kde.enable:
        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
        if not math.isclose(
                dask.compute(data["min"])[0],
                dask.compute(data["max"])[0]):
            data["dens"] = da.histogram(srs,
                                        cfg.kde.bins, (srs.min(), srs.max()),
                                        density=True)
            # gaussian kernel density estimate
            data["kde"] = gaussian_kde(
                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])),
                                   meta=srs))
        else:
            data["kde"] = None
    if cfg.box.enable:
        data.update(_calc_box(srs, data["qntls"], cfg))
    if cfg.value_table.enable:
        value_counts = srs.value_counts(sort=False)
        if cfg.stats.enable:
            data["nuniq"] = value_counts.shape[0]
        data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
    elif cfg.stats.enable:
        data["nuniq"] = srs.nunique_approx()

    return data
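Nothing in cont_comps is evaluated eagerly: each cfg flag only decides which lazy dask results are placed in the returned dict, and the caller materializes them all in a single pass with dask.compute. Below is a minimal, self-contained sketch of that pattern, using plain pandas/dask and hypothetical data; it is not dataprep's own code.

import dask
import dask.dataframe as dd
import pandas as pd

# hypothetical one-column data standing in for the srs argument
srs = dd.from_pandas(pd.Series([1.0, 2.5, 3.3, None, 4.8], name="x"), npartitions=2)
data = {"nrows": srs.shape[0]}          # lazy scalar
srs = srs.dropna()
data["npres"] = srs.shape[0]            # lazy scalar
data["mean"] = srs.mean()               # lazy scalar
data["std"] = srs.std()                 # lazy scalar

(data,) = dask.compute(data)            # one graph evaluation for every entry
print(data)                             # e.g. {'nrows': 5, 'npres': 4, 'mean': 2.9, ...}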
Example #2
from typing import Any, Dict, Union

import pandas
from dask.dataframe import DataFrame, Series


def dataframe_from_series_of_record_dict(series_of_record_dicts: Series,
                                         schema: Union[pandas.DataFrame, DataFrame, Dict[Any, Any]]):
    """
    API that converts a dask.dataframe.Series whose rows each hold a list of record dicts
    into a dask.dataframe.DataFrame. The given schema is used as the schema of the result.

    Args:
        series_of_record_dicts: dask Series whose rows each hold a list of record dicts.
        schema: schema of the result dataframe
    Return: dask dataframe
    Examples:
        >>> import dask.dataframe
        >>> import pandas
        >>> df = pandas.DataFrame({'a': range(100), 'b': range(100, 200)})
        >>> print(df)
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b
        0    0  100
        1    1  101
        2    2  102
        3    3  103
        4    4  104
        ..  ..  ...
        95  95  195
        96  96  196
        97  97  197
        98  98  198
        99  99  199
        [100 rows x 2 columns]
        >>> ddf = dask.dataframe.from_pandas(df, npartitions=4)
        >>> def build_df_from_row(row):
        ...     return [{'values': row.a, 'support': row.b},
        ...             {'values': row.a + 1, 'support': row.b},
        ...             {'values': row.a + 2, 'support': row.b}]
        >>> df_ds = ddf.apply(build_df_from_row, axis=1, meta='object')
        >>> print(df_ds)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask Series Structure:
        npartitions=4
        0     object
        25       ...
        50       ...
        75       ...
        99       ...
        dtype: object
        Dask Name: apply, 8 tasks
        >>> result = dataframe_from_series_of_record_dict(df_ds, {'values': 'int', 'support': 'int'})
        >>> print(result)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask DataFrame Structure:
                      values support
        npartitions=4
        0              int64   int64
        25               ...     ...
        50               ...     ...
        75               ...     ...
        99               ...     ...
        Dask Name: create_pandas_dataframe_in_partition, 12 tasks
        >>> print(result.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
            values  support
        0        0      100
        0        1      100
        0        2      100
        1        1      101
        1        2      101
        ..     ...      ...
        98      99      198
        98     100      198
        99      99      199
        99     100      199
        99     101      199
        [300 rows x 2 columns]
    """
    def create_pandas_dataframe_in_partition(series_chunk: pandas.Series):
        records = []
        index = []
        for i, v in series_chunk.items():
            records.extend(v)
            index.extend([i] * len(v))
        df = pandas.DataFrame.from_records(records, index=index)
        return df

    ddf = series_of_record_dicts.map_partitions(create_pandas_dataframe_in_partition, meta=schema)
    return ddf
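The per-partition helper just flattens each row's list of record dicts and repeats the originating index label. On a plain pandas chunk the effect looks like this (self-contained sketch with hypothetical data, not part of the API above):

import pandas

chunk = pandas.Series(
    [[{'values': 0, 'support': 100}, {'values': 1, 'support': 100}],
     [{'values': 1, 'support': 101}]],
    index=[0, 1])

records, index = [], []
for i, v in chunk.items():
    records.extend(v)              # flatten the list of record dicts
    index.extend([i] * len(v))     # repeat the source row's index label
print(pandas.DataFrame.from_records(records, index=index))
#    values  support
# 0       0      100
# 0       1      100
# 1       1      101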
Example #3
def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for plot(df, Continuous())

    Parameters
    ----------
    srs
        one numerical column
    bins
        the number of bins in the histogram
    """

    data: Dict[str, Any] = {}

    ## if cfg.stats_enable or cfg.hist_enable or
    # calculate the total number of rows then drop the missing values
    data["nrows"] = srs.shape[0]
    srs = srs.dropna()
    ## if cfg.stats_enable
    # number of not null (present) values
    data["npres"] = srs.shape[0]
    # remove infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # shared computations
    ## if cfg.stats_enable or cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
    data["min"], data["max"] = srs.min(), srs.max()
    ## if cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
    data["hist"] = da.histogram(srs,
                                bins=bins,
                                range=[data["min"], data["max"]])
    ## if cfg.insights_enable and (cfg.qqplot_enable or cfg.hist_enable):
    data["norm"] = normaltest(data["hist"][0])
    ## if cfg.qqplot_enable
    data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    ## elif cfg.stats_enable
    ## data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    ## elif cfg.boxplot_enable
    ## data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    ## if cfg.stats_enable or cfg.hist_enable and cfg.insights_enable:
    data["skew"] = skew(srs)

    # if cfg.stats_enable
    data["nuniq"] = srs.nunique()
    data["nreals"] = srs.shape[0]
    data["nzero"] = (srs == 0).sum()
    data["nneg"] = (srs < 0).sum()
    data["mean"] = srs.mean()
    data["std"] = srs.std()
    data["kurt"] = kurtosis(srs)
    data["mem_use"] = srs.memory_usage(deep=True)

    ## if cfg.hist_enable and cfg.insights_enable
    data["chisq"] = chisquare(data["hist"][0])

    # compute the density histogram
    data["dens"] = da.histogram(srs,
                                bins=bins,
                                range=[data["min"], data["max"]],
                                density=True)
    # gaussian kernel density estimate
    data["kde"] = gaussian_kde(
        srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])),
                           meta=srs))

    ## if cfg.box_enable
    data.update(calc_box(srs, data["qntls"]))

    return data
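The density histogram and the kernel density estimate at the end follow the usual dask.array / scipy pattern. Here is a small self-contained sketch of that pattern on a plain dask array (hypothetical data, not the srs above):

import dask.array as da
import numpy as np
from scipy.stats import gaussian_kde

x = da.random.normal(size=10_000, chunks=2_500)
# density=True normalizes the counts so the histogram integrates to 1
counts, edges = da.histogram(x, bins=20, range=(-4, 4), density=True)
print(counts.compute()[:3], edges[:3])

# fit the KDE on a small materialized sample, as the code above does per partition
kde = gaussian_kde(np.asarray(x[:1_000]))
print(kde(np.linspace(-3, 3, 5)))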
Example #4
from typing import Any, Dict, Optional, Union

import pandas
from dask.dataframe import DataFrame, Series


def dataframe_from_series_of_pandas(series_of_pandas_dataframes: Series,
                                    schema: Optional[Union[pandas.DataFrame, DataFrame, Dict[Any, Any]]] = None):
    """
    API that converts a dask.dataframe.Series whose rows each hold a pandas.DataFrame
    into a dask.dataframe.DataFrame. If schema is given it is used as the schema of the
    result; if it is None, the dataframe held in the first row of the first partition is used.
    The index of each dataframe inside the series is ignored and overwritten by the index
    of the original series.

    Args:
        series_of_pandas_dataframes: dask Series whose rows each hold a pandas DataFrame.
        schema: schema of the result dataframe
    Return: dask dataframe
    Examples:
        >>> import dask.dataframe
        >>> import pandas
        >>> df = pandas.DataFrame({'a': range(100), 'b': range(100, 200)})
        >>> print(df)
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b
        0    0  100
        1    1  101
        2    2  102
        3    3  103
        4    4  104
        ..  ..  ...
        95  95  195
        96  96  196
        97  97  197
        98  98  198
        99  99  199
        [100 rows x 2 columns]
        >>> ddf = dask.dataframe.from_pandas(df, npartitions=4)
        >>> def build_df_from_row(row):
        ...     pdf = pandas.DataFrame({'values': [row.a, row.a + 1, row.a + 2], 'support': [row.b, row.b, row.b]})
        ...     return pdf
        >>> df_ds = ddf.apply(build_df_from_row, axis=1)
        >>> print(df_ds)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask Series Structure:
        npartitions=4
        0     object
        25       ...
        50       ...
        75       ...
        99       ...
        dtype: object
        Dask Name: apply, 8 tasks
        >>> result = dataframe_from_series_of_pandas(df_ds)
        >>> print(result)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask DataFrame Structure:
                      values support
        npartitions=4
        0              int64   int64
        25               ...     ...
        50               ...     ...
        75               ...     ...
        99               ...     ...
        Dask Name: create_pandas_dataframe_in_partition, 12 tasks
        >>> print(result.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
            values  support
        0        0      100
        0        1      100
        0        2      100
        1        1      101
        1        2      101
        ..     ...      ...
        98      99      198
        98     100      198
        99      99      199
        99     100      199
        99     101      199
        [300 rows x 2 columns]
    """
    if schema is None:
        schema = series_of_pandas_dataframes.head(1).iloc[0]

    def create_pandas_dataframe_in_partition(series_chunk: pandas.Series):
        # overwrite each held dataframe's index with its row label, then concatenate
        parts = [v.set_axis([i] * len(v.index), axis=0)
                 for i, v in series_chunk.items()]
        return pandas.concat(parts, axis=0)

    ddf = series_of_pandas_dataframes.map_partitions(create_pandas_dataframe_in_partition, meta=schema)
    return ddf
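Per partition, the helper simply re-indexes each held DataFrame with the label of the row that carried it and stacks the results. On a plain pandas chunk it looks like this (self-contained sketch with hypothetical data, not part of the API above):

import pandas

chunk = pandas.Series(
    [pandas.DataFrame({'values': [0, 1, 2], 'support': [100, 100, 100]}),
     pandas.DataFrame({'values': [1, 2, 3], 'support': [101, 101, 101]})],
    index=[0, 1])

# overwrite each dataframe's index with its row label, then stack them
parts = [v.set_axis([i] * len(v.index), axis=0) for i, v in chunk.items()]
print(pandas.concat(parts, axis=0))
#    values  support
# 0       0      100
# 0       1      100
# 0       2      100
# 1       1      101
# 1       2      101
# 1       3      101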