Example 1
0
    def _aca_agg(self, token, func, aggfunc=None):
        """Shared apply-concat-apply aggregation helper.

        ``func`` runs on each per-partition groupby (chunk step) and
        ``aggfunc`` — defaulting to ``func`` — on the groupby of the
        concatenated chunk results (aggregate step).
        """
        if aggfunc is None:
            aggfunc = func

        full_token = self._token_prefix + token
        key = self.key

        if isinstance(self.index, Series):
            # Grouper is itself a Series: hand it to aca alongside the frame
            # so each chunk receives its aligned slice as `index`.

            def chunk(df, index, func=func, key=key):
                # func/key bound as defaults so the closure does not capture self.
                grouped = df.groupby(index)
                if isinstance(df, pd.Series):
                    return func(grouped)
                return func(grouped[key])

            def agg(df):
                return aggfunc(df.groupby(level=0))

            return aca([self.df, self.index], chunk=chunk, aggregate=agg,
                       columns=key, token=full_token)

        # Grouper is a column name (or list of names): groupby directly.
        def chunk(df, index=self.index, func=func, key=key):
            return func(df.groupby(index)[key])

        # A list grouper yields a MultiIndex; aggregate over all its levels.
        if isinstance(self.index, list):
            levels = list(range(len(self.index)))
        else:
            levels = 0

        def agg(df):
            return aggfunc(df.groupby(level=levels))

        return aca(self.df, chunk=chunk, aggregate=agg,
                   columns=key, token=full_token)
Example 2
0
def test_deterministic_apply_concat_apply_names():
    """Identical ACA operations must produce identical task graphs."""
    pdf = pd.DataFrame({"x": [1, 2, 3, 4], "y": [5, 6, 7, 8]})
    a = dd.from_pandas(pdf, npartitions=2)

    def keys(obj):
        # Sorted task names of the collection's graph.
        return sorted(obj.dask)

    assert keys(a.x.nlargest(2)) == keys(a.x.nlargest(2))
    assert keys(a.x.nlargest(2)) != keys(a.x.nlargest(3))
    assert keys(a.x.drop_duplicates()) == keys(a.x.drop_duplicates())
    assert keys(a.groupby("x").y.mean()) == keys(a.groupby("x").y.mean())
    # Test aca without passing in token string
    f = lambda a: a.nlargest(5)
    f2 = lambda a: a.nlargest(3)
    assert keys(aca(a.x, f, f, a.x.name)) != keys(aca(a.x, f2, f2, a.x.name))
    assert keys(aca(a.x, f, f, a.x.name)) == keys(aca(a.x, f, f, a.x.name))
Example 3
0
    def nunique(self):
        """Count the number of distinct values per group (apply-concat-apply).

        Fix: the original ``chunk``/``agg`` closed over ``self`` (via
        ``self.key`` and ``self.df``), dragging the whole groupby object into
        the functions shipped to workers. ``key`` is now passed as an explicit
        argument and ``agg`` captures only plain locals, matching the other
        ``nunique`` implementation in this file.
        """
        def chunk(df, index, key):
            # we call set_index here to force a possibly duplicate index
            # for our reduce step
            if isinstance(df, pd.DataFrame):
                grouped = (df.groupby(index).apply(
                    pd.DataFrame.drop_duplicates, subset=key))
                grouped.index = grouped.index.get_level_values(level=0)
            else:
                if isinstance(index, np.ndarray):
                    # Align a raw ndarray grouper with the partition's index.
                    assert len(index) == len(df)
                    index = pd.Series(index, index=df.index)
                grouped = pd.concat([df, index], axis=1).drop_duplicates()
            return grouped

        # Capture plain values so the aggregate closure does not reference self.
        key = self.key
        is_series = isinstance(self.df, Series)

        def agg(df):
            if is_series:
                # Series case: chunks produced (values, grouper) columns.
                return df.groupby(df.columns[1])[df.columns[0]].nunique()
            else:
                return df.groupby(level=0)[key].nunique()

        return aca([self.df, self.index, self.key],
                   chunk=chunk,
                   aggregate=agg,
                   columns=self.key,
                   token='series-groupby-nunique')
Example 4
0
    def nunique(self):
        """Count the number of distinct values per group.

        Implemented as apply-concat-apply: each partition is de-duplicated
        within its groups, then a single groupby/nunique runs over the
        concatenated survivors.
        """
        def chunk(df, index, key):
            # we call set_index here to force a possibly duplicate index
            # for our reduce step
            if isinstance(df, pd.DataFrame):
                grouped = (df.groupby(index)
                        .apply(pd.DataFrame.drop_duplicates, subset=key))
                grouped.index = grouped.index.get_level_values(level=0)
            else:
                if isinstance(index, np.ndarray):
                    # Align a raw ndarray grouper with the partition's index
                    # before concatenating.
                    assert len(index) == len(df)
                    index = pd.Series(index, index=df.index)
                grouped = pd.concat([df, index], axis=1).drop_duplicates()
            return grouped

        # Capture plain values so the aggregate closure below does not hold a
        # reference to self (NOTE(review): presumably to keep it cheaply
        # picklable when shipped to workers — confirm against aca's model).
        key = self.key
        is_series = isinstance(self.df, Series)

        def agg(df):
            if is_series:
                # Series case: chunks produced a two-column frame of
                # (values, grouper); count distinct values per grouper.
                return df.groupby(df.columns[1])[df.columns[0]].nunique()
            else:
                return df.groupby(level=0)[key].nunique()

        return aca([self.df, self.index, self.key],
                   chunk=chunk, aggregate=agg, columns=self.key,
                   token='series-groupby-nunique')
Example 5
0
def test_deterministic_apply_concat_apply_names():
    # Graph keys must be deterministic: building the same operation twice
    # yields identical task names, different arguments yield different names.
    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
    a = dd.from_pandas(df, npartitions=2)

    # sorted(...) over a dask graph dict compares the sorted task keys.
    assert sorted(a.x.nlargest(2).dask) == sorted(a.x.nlargest(2).dask)
    assert sorted(a.x.nlargest(2).dask) != sorted(a.x.nlargest(3).dask)
    assert sorted(a.x.drop_duplicates().dask) == \
           sorted(a.x.drop_duplicates().dask)
    assert sorted(a.groupby('x').y.mean().dask) == \
           sorted(a.groupby('x').y.mean().dask)
    # Test aca without passing in token string
    f = lambda a: a.nlargest(5)
    f2 = lambda a: a.nlargest(3)
    assert sorted(aca(a.x, f, f, a.x.name).dask) !=\
           sorted(aca(a.x, f2, f2, a.x.name).dask)
    assert sorted(aca(a.x, f, f, a.x.name).dask) ==\
           sorted(aca(a.x, f, f, a.x.name).dask)
Example 6
0
    def nunique(self):
        """Count distinct values per group via apply-concat-apply."""
        name = self._pd.obj.name

        # Pick the chunk function and matching aggregate for the input kind,
        # then issue a single aca call.
        if isinstance(self.obj, DataFrame):
            chunk = _nunique_df_chunk

            def agg(df):
                return df.groupby(level=0)[name].nunique()
        else:
            chunk = _nunique_series_chunk

            def agg(df):
                return df.groupby(df.columns[1])[df.columns[0]].nunique()

        return aca([self.obj, self.index], chunk=chunk, aggregate=agg,
                   columns=name, token='series-groupby-nunique')
Example 7
0
    def nunique(self):
        """Count distinct values per group via apply-concat-apply."""
        name = self._pd.obj.name

        if isinstance(self.obj, DataFrame):

            def agg(df):
                # Chunk output is indexed by group key; count distinct values
                # of the selected column per level-0 group.
                return df.groupby(level=0)[name].nunique()

            return aca([self.obj, self.index],
                       chunk=_nunique_df_chunk, aggregate=agg,
                       columns=name, token='series-groupby-nunique')
        else:

            def agg(df):
                # Series case: chunk output appears to be a two-column frame
                # of (values, grouper) — confirm against _nunique_series_chunk.
                return df.groupby(df.columns[1])[df.columns[0]].nunique()

            return aca([self.obj, self.index],
                       chunk=_nunique_series_chunk, aggregate=agg,
                       columns=name, token='series-groupby-nunique')
Example 8
0
    def _aca_agg(self, token, func, aggfunc=None):
        """Shared apply-concat-apply aggregation helper.

        Parameters
        ----------
        token : str
            Suffix for the task-name token; prefixed with ``_token_prefix``.
        func : callable
            Applied to each per-partition groupby (chunk step).
        aggfunc : callable, optional
            Applied to the groupby of the concatenated chunk results
            (aggregate step). Defaults to ``func``.
        """
        if aggfunc is None:
            aggfunc = func

        if isinstance(self.index, Series):
            # Grouper is itself a Series: hand it to aca alongside the frame
            # so each chunk receives its aligned slice as `index`.

            def chunk(df, index, func=func, key=self.key):
                # func/key bound as defaults so the closure does not capture self.
                if isinstance(df, pd.Series):
                    return func(df.groupby(index))
                else:
                    return func(df.groupby(index)[key])

            agg = lambda df: aggfunc(df.groupby(level=0))
            token = self._token_prefix + token

            return aca([self.df, self.index],
                       chunk=chunk,
                       aggregate=agg,
                       columns=self.key,
                       token=token)
        else:
            # Grouper is a column name (or list of names): groupby directly.

            def chunk(df, index=self.index, func=func, key=self.key):
                return func(df.groupby(index)[key])

            # Grouping by several columns yields a MultiIndex; aggregate over
            # every one of its levels.
            if isinstance(self.index, list):
                levels = list(range(len(self.index)))
            else:
                levels = 0
            agg = lambda df: aggfunc(df.groupby(level=levels))
            token = self._token_prefix + token

            return aca(self.df,
                       chunk=chunk,
                       aggregate=agg,
                       columns=self.key,
                       token=token)
Example 9
0
    def var(self, ddof=1):
        """Per-group variance.

        Parameters
        ----------
        ddof : int, default 1
            Delta degrees of freedom, forwarded to the aggregate step.
        """
        from functools import partial
        meta = self.obj._pd
        if isinstance(meta, pd.Series):
            meta = meta.to_frame()
        # Fix: compute the metadata with the same ddof the real aggregation
        # uses (was hard-coded to ddof=1, diverging from the ddof argument
        # that _var_agg receives below).
        meta = meta.groupby(self.index).var(ddof=ddof)
        result = aca([self.obj, self.index], _var_chunk,
                     partial(_var_agg, ddof=ddof), meta,
                     token=self._token_prefix + 'var')

        if isinstance(self.obj, Series):
            # Series input: unwrap the single-column frame back to a Series.
            result = result[result.columns[0]]
        if self._slice:
            result = result[self._slice]

        return result
Example 10
0
    def var(self, ddof=1):
        """Per-group variance.

        Parameters
        ----------
        ddof : int, default 1
            Delta degrees of freedom, forwarded to the aggregate step.
        """
        from functools import partial
        meta = self.obj._pd
        if isinstance(meta, pd.Series):
            meta = meta.to_frame()
        # Fix: compute the metadata with the same ddof the real aggregation
        # uses (was hard-coded to ddof=1, diverging from the ddof argument
        # that _var_agg receives below).
        meta = meta.groupby(self.index).var(ddof=ddof)
        result = aca([self.obj, self.index],
                     _var_chunk,
                     partial(_var_agg, ddof=ddof),
                     meta,
                     token=self._token_prefix + 'var')

        if isinstance(self.obj, Series):
            # Series input: unwrap the single-column frame back to a Series.
            result = result[result.columns[0]]
        if self._slice:
            result = result[self._slice]

        return result
Example 11
0
    def _aca_agg(self, token, func, aggfunc=None):
        """Apply-concat-apply aggregation: ``func`` per chunk, ``aggfunc``
        (defaulting to ``func``) on the concatenated result."""
        if aggfunc is None:
            aggfunc = func

        # Evaluate func on the empty metadata object to learn the output's
        # name (Series) or columns (DataFrame).
        dummy = func(self._pd)
        if isinstance(dummy, pd.Series):
            columns = dummy.name
        else:
            columns = dummy.columns

        full_token = self._token_prefix + token

        # A list grouper produces a MultiIndex; aggregate over all its levels.
        if isinstance(self.index, list):
            levels = list(range(len(self.index)))
        else:
            levels = 0

        def agg(df):
            return aggfunc(df.groupby(level=levels))

        return aca([self.obj, self.index, func, columns],
                   chunk=_apply_chunk, aggregate=agg,
                   columns=dummy, token=full_token)
Example 12
0
    def _aca_agg(self, token, func, aggfunc=None):
        """Apply-concat-apply aggregation helper.

        ``func`` is applied to each per-partition groupby (chunk step) and
        ``aggfunc`` — defaulting to ``func`` — to the groupby of the
        concatenated chunk results (aggregate step).
        """
        if aggfunc is None:
            aggfunc = func

        # Run func on the empty metadata object to discover the output's
        # name (Series) or columns (DataFrame).
        dummy = func(self._pd)
        columns = dummy.name if isinstance(dummy, pd.Series) else dummy.columns

        token = self._token_prefix + token

        # Grouping by a list of columns yields a MultiIndex; aggregate over
        # every one of its levels.
        if isinstance(self.index, list):
            levels = list(range(len(self.index)))
        else:
            levels = 0

        agg = lambda df: aggfunc(df.groupby(level=levels))

        return aca([self.obj, self.index, func, columns],
                   chunk=_apply_chunk, aggregate=agg,
                   columns=dummy, token=token)