Example No. 1
    def first(self):
        """
        Pull out the first from each group. Note: this is different from
        Spark's first.
        """
        # If it's possible to use Spark SQL grouping, do it
        if self._can_use_new_school():
            self._prep_new_school()
            import pyspark.sql.functions as func

            return self._use_aggregation(func.first)
        myargs = self._myargs
        mykwargs = self._mykwargs
        self._prep_old_school()

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).first()

        def merge_value(x, y):
            # x is already a combined "first per group" frame, so the
            # first value seen wins and later chunks are ignored.
            return x

        def merge_combiner(x, y):
            # Likewise, keep the earlier combiner.
            return x

        rddOfFirst = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value, merge_combiner)
        ).values()
        return Dataframe.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
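The combineByKey callbacks above are plain pandas functions, so their merge semantics are easy to check locally. A minimal sketch of the "first" merge, assuming a single grouping column named "k" (the column names and data are made up for illustration):

import pandas as pd

def create_combiner(x):
    return x.groupby("k").first()

def merge_value(x, y):
    # x is already a "first per group" frame; the first value wins.
    return x

chunk1 = pd.DataFrame({"k": ["a", "b"], "v": [1, 2]})
chunk2 = pd.DataFrame({"k": ["a"], "v": [9]})
print(merge_value(create_combiner(chunk1), chunk2))
# group "a" keeps v=1 and group "b" keeps v=2; chunk2's v=9 is ignored

Whether this really yields the first row per group depends on chunks arriving in order, which the surrounding sort/merge machinery appears to assume.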
Example No. 2
    def median(self):
        """Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        self._prep_old_school()
        return Dataframe.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(lambda x: x.median()), self.sql_ctx
        )
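The old-school path simply defers to pandas per group: each element of _regroup_mergedRDD().values() behaves like a local pandas GroupBy. A local equivalent, with made-up data and an assumed grouping column "k":

import pandas as pd

df = pd.DataFrame({"k": ["a", "a", "b"], "v": [1.0, 3.0, 5.0]})
print(df.groupby("k").median())  # per-group medians; NaNs are excluded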
Example No. 3
    def nth(self, n, *args, **kwargs):
        """Take the nth element of each groupby."""
        # TODO: Stop collecting the entire frame for each key.
        self._prep_old_school()
        nthRDD = self._regroup_mergedRDD().mapValues(
            lambda r: r.nth(n, *args, **kwargs)
        ).values()
        return Dataframe.fromDataFrameRDD(nthRDD, self.sql_ctx)
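Per regrouped chunk this is just pandas' GroupBy.nth, whose semantics a quick local check illustrates (made-up data, grouping column "k" assumed):

import pandas as pd

df = pd.DataFrame({"k": ["a", "a", "b"], "v": [1, 2, 3]})
print(df.groupby("k").nth(0))  # first row of each group
print(df.groupby("k").nth(1))  # only group "a" has a second row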
Example No. 4
    def var(self, ddof=1):
        """Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        self._prep_old_school()
        return Dataframe.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(lambda x: x.var(ddof=ddof)), self.sql_ctx
        )
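The ddof argument is forwarded straight to pandas, so the usual sample/population distinction applies. A small local demonstration (made-up data):

import pandas as pd

df = pd.DataFrame({"k": ["a", "a", "a"], "v": [1.0, 2.0, 3.0]})
g = df.groupby("k")
print(g.var(ddof=1))  # sample variance (the default): 1.0
print(g.var(ddof=0))  # population variance: 2/3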
Example No. 5
    def mean(self):
        """Compute mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        if self._can_use_new_school():
            self._prep_new_school()
            import pyspark.sql.functions as func

            return self._use_aggregation(func.mean)
        self._prep_old_school()
        return Dataframe.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(lambda x: x.mean()), self.sql_ctx
        )
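When the new-school path is taken, the work is pushed down to Spark SQL. Stripped of the wrapper, it amounts to something like this standalone PySpark snippet; the SparkSession setup and column names are assumptions for a self-contained example (the original code holds a sql_ctx, suggesting the older SQLContext entry point):

from pyspark.sql import SparkSession
import pyspark.sql.functions as func

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("a", 1.0), ("a", 3.0), ("b", 5.0)], ["k", "v"])
df.groupBy("k").agg(func.mean("v")).show()  # one mean per group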
Example No. 6
    def aggregate(self, f):
        """Apply the aggregation function.
        Note: This implementation does note take advantage of partial
        aggregation unless we have one of the special cases.
        Currently the only special case is Series.kurtosis - and even
        that doesn't properly do partial aggregations, but we can improve
        it to do this eventually!
        """
        if self._can_use_new_school() and f == pd.Series.kurtosis:
            self._prep_new_school()
            import custom_functions as CF

            return self._use_aggregation(CF.kurtosis)
        else:
            self._prep_old_school()
            return Dataframe.fromDataFrameRDD(
                self._regroup_mergedRDD().values().map(lambda g: g.aggregate(f)), self.sql_ctx
            )
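On the old-school path, f is handed to pandas' GroupBy.aggregate inside each regrouped chunk, so anything pandas can reduce per group works. A local sketch using the Series.kurtosis case named in the docstring (made-up data; the column selection is for a clean Series result, and groups with fewer than four rows yield NaN):

import pandas as pd

df = pd.DataFrame({"k": ["a"] * 5 + ["b"] * 5, "v": range(10)})
print(df.groupby("k")["v"].aggregate(pd.Series.kurtosis))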
Example No. 7
    def apply(self, func, *args, **kwargs):
        """Apply the provided function and combine the results together in the
        same way as apply from groupby in pandas.

        This returns a Dataframe.
        """
        self._prep_old_school()

        def key_by_index(data):
            """Key each row by its index."""
            # TODO: Is there a better way to do this?
            for key, row in data.iterrows():
                yield (key, pd.DataFrame.from_dict({key: row}, orient="index"))

        myargs = self._myargs
        mykwargs = self._mykwargs
        regroupedRDD = self._distributedRDD.mapValues(lambda data: data.groupby(*myargs, **mykwargs))
        appliedRDD = regroupedRDD.map(lambda key_data: key_data[1].apply(func, *args, **kwargs))
        reKeyedRDD = appliedRDD.flatMap(key_by_index)
        dataframe = self._sortIfNeeded(reKeyedRDD).values()
        return Dataframe.fromDataFrameRDD(dataframe, self.sql_ctx)
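The interesting step is key_by_index: after pandas' apply runs per group, every output row is re-keyed by its index so the rows can be sorted and reassembled into one distributed frame. Its behavior in isolation (made-up frame):

import pandas as pd

def key_by_index(data):
    # Emit (index, one-row DataFrame) pairs for each row.
    for key, row in data.iterrows():
        yield (key, pd.DataFrame.from_dict({key: row}, orient="index"))

df = pd.DataFrame({"v": [10, 20]}, index=["x", "y"])
for key, frame in key_by_index(df):
    print(key, frame.to_dict("index"))
# x {'x': {'v': 10}}
# y {'y': {'v': 20}}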
Example No. 8
    def last(self):
        """Pull out the last from each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs
        # If it's possible to use Spark SQL grouping, do it
        if self._can_use_new_school():
            self._prep_new_school()
            import pyspark.sql.functions as func

            return self._use_aggregation(func.last)

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).last()

        def merge_value(x, y):
            # The newest chunk's last row per group wins.
            return create_combiner(y)

        def merge_combiner(x, y):
            # Likewise, the later combiner wins.
            return y

        rddOfLast = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value, merge_combiner)
        ).values()
        return Dataframe.fromDataFrameRDD(rddOfLast, self.sql_ctx)
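The merge functions mirror first with the opposite bias: merging in a new chunk takes that chunk's last rows, and merging two combiners keeps the later one. Checked locally with made-up data and an assumed grouping column "k":

import pandas as pd

def create_combiner(x):
    return x.groupby("k").last()

def merge_value(x, y):
    # The newest chunk's last row wins.
    return create_combiner(y)

chunk1 = pd.DataFrame({"k": ["a"], "v": [1]})
chunk2 = pd.DataFrame({"k": ["a"], "v": [9]})
print(merge_value(create_combiner(chunk1), chunk2))  # group "a" ends at v=9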
Example No. 9
    def max(self):
        """Compute the max for each group."""
        if self._can_use_new_school():
            self._prep_new_school()
            import pyspark.sql.functions as func

            return self._use_aggregation(func.max)
        self._prep_old_school()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).max()

        def merge_value(x, y):
            # Fold a new chunk into the running per-group maxima.
            return pd.concat([x, create_combiner(y)]).groupby(level=0).max()

        def merge_combiner(x, y):
            # Merge two partial results, re-reducing on the group index.
            return pd.concat([x, y]).groupby(level=0).max()

        rddOfMax = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value, merge_combiner)
        ).values()
        return Dataframe.fromDataFrameRDD(rddOfMax, self.sql_ctx)
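Unlike first/last, max genuinely folds partial results together: each combiner is a per-group max frame whose index carries the group key, and merging is "stack, then reduce again on index level 0". A local sketch with made-up data:

import pandas as pd

def create_combiner(x):
    return x.groupby("k").max()

part1 = create_combiner(pd.DataFrame({"k": ["a", "b"], "v": [1, 7]}))
part2 = create_combiner(pd.DataFrame({"k": ["a"], "v": [5]}))
print(pd.concat([part1, part2]).groupby(level=0).max())
# a -> 5, b -> 7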
Example No. 10
    def sum(self):
        """Compute the sum for each group."""
        if self._can_use_new_school():
            self._prep_new_school()
            import pyspark.sql.functions as func

            return self._use_aggregation(func.sum)
        self._prep_old_school()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            # Fold a new chunk into the running per-group sums.
            return pd.concat([x, create_combiner(y)]).groupby(level=0).sum()

        def merge_combiner(x, y):
            # Add the partial sums; fill_value=0 keeps groups that only
            # appear on one side from becoming NaN.
            return x.add(y, fill_value=0)

        rddOfSum = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value, merge_combiner)
        ).values()
        return Dataframe.fromDataFrameRDD(rddOfSum, self.sql_ctx)
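The fill_value=0 in merge_combiner matters because partial sums from different partitions need not cover the same groups; a plain + would turn any one-sided group into NaN. A local illustration with made-up partial sums:

import pandas as pd

x = pd.DataFrame({"v": [3.0]}, index=["a"])
y = pd.DataFrame({"v": [4.0]}, index=["b"])
print(x + y)                   # NaN for both rows: indexes don't align
print(x.add(y, fill_value=0))  # a -> 3.0, b -> 4.0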