Ejemplo n.º 1
0
    def __init__(self, df, by, axis, level, as_index, sort, group_keys,
                 squeeze, **kwargs):
        """Set up a group-by over the block partitions of ``df``.

        Args:
            df: Frame being grouped. It must expose ``columns``, ``index``,
                ``_row_metadata``, ``_col_metadata`` and
                ``_block_partitions``.
            by: Grouping key(s); used for the local ``Series.groupby`` and
                forwarded to every submitted task.
            axis: ``0`` groups the labels of ``df.index``; any other value
                groups the labels of ``df.columns``.
            level: Forwarded unchanged to every submitted task.
            as_index: Forwarded unchanged to every submitted task.
            sort: Used for the local ``Series.groupby`` and forwarded to
                every submitted task.
            group_keys: Forwarded unchanged to every submitted task.
            squeeze: Forwarded unchanged to every submitted task.
            **kwargs: Accepted for signature compatibility; not used here.
        """
        self._columns = df.columns
        self._index = df.index
        self._axis = axis

        self._row_metadata = df._row_metadata
        self._col_metadata = df._col_metadata

        if axis == 0:
            # Iterating the transposed block grid yields one entry per
            # column of blocks.
            partitions = list(df._block_partitions.T)
            labels = self._index
        else:
            partitions = list(df._block_partitions)
            labels = self._columns

        # The helper Series is indexed by the very labels it holds. Indexing
        # the column labels by the row index (as was done before for
        # axis != 0) fails when the frame is not square and makes mapping or
        # callable keys operate on the wrong labels.
        self._index_grouped = pd.Series(labels, index=labels).groupby(
            by=by, sort=sort)

        self._keys_and_values = list(self._index_grouped)

        # One task per partition; zip(*...) regroups the results so that each
        # element collects one entry from every partition.
        # NOTE(review): len(self) relies on __len__ defined elsewhere in the
        # class -- presumably the number of groups; confirm.
        task_options = (by, axis, level, as_index, sort, group_keys, squeeze)
        self._grouped_partitions = list(zip(*(
            groupby._submit(args=task_options + tuple(part.tolist()),
                            num_return_vals=len(self))
            for part in partitions)))
Ejemplo n.º 2
0
    def _grouped_partitions(self):

        # It is expensive to put this multiple times, so let's just put it once
        remote_by = ray.put(self._by)

        if len(self._index_grouped) > 1:
            return zip(*(groupby._submit(args=(remote_by,
                                               self._axis,
                                               self._level,
                                               self._as_index,
                                               self._sort,
                                               self._group_keys,
                                               self._squeeze)
                                         + tuple(part.tolist()),
                                         num_return_vals=len(
                                             self._index_grouped))
                         for part in self._partitions))
        elif self._axis == 0:
            return [self._df._col_partitions]
        else:
            return [self._df._row_partitions]
Ejemplo n.º 3
0
    def _grouped_partitions(self):

        # It is expensive to put this multiple times, so let's just put it once
        remote_by = ray.put(self._by)
        remote_index = \
            [ray.put(v.index) for _, v in
             self._df._col_metadata._coord_df.copy().groupby(by='partition')] \
            if self._axis == 0 \
            else [ray.put(v.index) for _, v in
                  self._df._row_metadata._coord_df.copy()
                      .groupby(by='partition')]

        if len(self._index_grouped) > 1:
            return zip(*(
                groupby._submit(args=(remote_index[i], remote_by, self._axis,
                                      self._level, self._as_index, self._sort,
                                      self._group_keys, self._squeeze) +
                                tuple(part.tolist()),
                                num_return_vals=len(self._index_grouped))
                for i, part in enumerate(self._partitions)))
        elif self._axis == 0:
            return [self._df._col_partitions]
        else:
            return [self._df._row_partitions]