Example #1
0
    def apply(self, func, *args, **kwargs):
        """Run ``func`` over every group and stitch the results together in
        the same way as apply from groupby in pandas.

        ``args`` and ``kwargs`` are forwarded to each per-group ``apply``
        call.

        This returns a DataFrame.
        """
        self._prep_pandas_groupby()
        group_args = self._myargs
        group_kwargs = self._mykwargs

        def regroup(frame):
            """Rebuild the pandas groupby on one distributed frame."""
            return frame.groupby(*group_args, **group_kwargs)

        def apply_to_group(pair):
            """Apply func to the grouped value of a (key, grouped) pair."""
            return pair[1].apply(func, *args, **kwargs)

        def split_rows(frame):
            """Yield one (index, single-row DataFrame) pair per row."""
            # TODO: Is there a better way to do this?
            for idx, row in frame.iterrows():
                single_row = pd.DataFrame.from_dict(
                    {idx: row}, orient='index')
                yield (idx, single_row)

        grouped = self._distributedRDD.mapValues(regroup)
        applied = grouped.map(apply_to_group)
        rekeyed = applied.flatMap(split_rows)
        frames = self._sortIfNeeded(rekeyed).values()
        return DataFrame.fromDataFrameRDD(frames, self.sql_ctx)
Example #2
0
    def first(self):
        """
        Pull out the first from each group. Note: this is different than
        Spark's first.

        When Spark SQL grouping can be used the work is delegated to
        ``pyspark.sql.functions.first``. Otherwise the per-key pandas frames
        are folded with ``combineByKey``, keeping the combiner that was
        built first for each key.

        This returns a DataFrame.
        """
        # If its possible to use Spark SQL grouping do it
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.first)
        myargs = self._myargs
        mykwargs = self._mykwargs
        self._prep_pandas_groupby()

        def create_combiner(x):
            # Turn a raw frame into its per-group "first" result.
            return x.groupby(*myargs, **mykwargs).first()

        def merge_value(x, y):
            # x is already a combiner (the output of create_combiner), so it
            # must not be grouped again; the later value y is ignored
            # because only the first result is wanted.
            return x

        def merge_combiner(x, y):
            # Between two combiners, keep the one seen first.
            return x

        rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
Example #3
0
    def var(self, ddof=1):
        """Compute variance of groups, excluding missing values.

        ``ddof`` is passed through to the per-group ``var`` call.

        For multiple groupings, the result index will be a MultiIndex.
        """
        self._prep_pandas_groupby()

        def group_variance(grouped):
            """Variance of one regrouped value with the requested ddof."""
            return grouped.var(ddof=ddof)

        variances = self._regroup_mergedRDD().values().map(group_variance)
        return DataFrame.fromDataFrameRDD(variances, self.sql_ctx)
Example #4
0
    def median(self):
        """Compute the per-group median, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        self._prep_pandas_groupby()

        def group_median(grouped):
            """Median of one regrouped value."""
            return grouped.median()

        medians = self._regroup_mergedRDD().values().map(group_median)
        return DataFrame.fromDataFrameRDD(medians, self.sql_ctx)
Example #5
0
 def nth(self, n, *args, **kwargs):
     """Take the nth element of each groupby.

     ``n`` and any extra ``args`` / ``kwargs`` are forwarded unchanged to
     the ``nth`` call made on each regrouped value.

     This returns a DataFrame.
     """
     # TODO: Stop collecting the entire frame for each key.
     self._prep_pandas_groupby()
     nthRDD = self._regroup_mergedRDD().mapValues(
         lambda r: r.nth(n, *args, **kwargs)).values()
     return DataFrame.fromDataFrameRDD(nthRDD, self.sql_ctx)
Example #6
0
 def from_pandas_rdd(self, pandas_rdd):
     """Build a Sparkling Pandas DataFrame out of an RDD whose elements
     are pandas DataFrames. Note: the current version drops index
     information.

     Parameters
     ----------
     pandas_rdd: RDD[pandas.DataFrame]

     Returns
     -------
     Sparkling Pandas DataFrame.
     """
     sql_context = self.sql_ctx
     return DataFrame.fromDataFrameRDD(pandas_rdd, sql_context)
Example #7
0
 def from_pandas_rdd(self, pandas_rdd):
     """Wrap an RDD of pandas DataFrames as a Sparkling Pandas DataFrame.
     Note: the current version drops index information.

     Parameters
     ----------
     pandas_rdd: RDD[pandas.DataFrame]

     Returns
     -------
     Sparkling Pandas DataFrame.
     """
     sql_context = self.sql_ctx
     return DataFrame.fromDataFrameRDD(pandas_rdd, sql_context)
Example #8
0
    def mean(self):
        """Compute the per-group mean, excluding missing values.

        Uses the Spark SQL aggregation when it is available and otherwise
        falls back to calling ``mean`` on each regrouped pandas value.

        For multiple groupings, the result index will be a MultiIndex.
        """
        if not self._can_use_new_school():
            # pandas fallback path
            self._prep_pandas_groupby()
            means = self._regroup_mergedRDD().values().map(
                lambda grouped: grouped.mean())
            return DataFrame.fromDataFrameRDD(means, self.sql_ctx)
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as sql_functions
        return self._use_aggregation(sql_functions.mean)
Example #9
0
 def aggregate(self, f):
     """Apply the aggregation function ``f`` to every group.

     Note: This implementation does not take advantage of partial
     aggregation unless we have one of the special cases.
     Currently the only special case is Series.kurtosis - and even
     that doesn't properly do partial aggregations, but we can improve
     it to do this eventually!
     """
     use_sql_kurtosis = (
         self._can_use_new_school() and f == pd.Series.kurtosis)
     if not use_sql_kurtosis:
         # General case: run f on each regrouped pandas value.
         self._prep_pandas_groupby()
         aggregated = self._regroup_mergedRDD().values().map(
             lambda grouped: grouped.aggregate(f))
         return DataFrame.fromDataFrameRDD(aggregated, self.sql_ctx)
     # Special case: kurtosis goes through the Spark SQL aggregation.
     self._prep_spark_sql_groupby()
     import custom_functions as CF
     return self._use_aggregation(CF.kurtosis)
Example #10
0
    def last(self):
        """Pull out the last from each group.

        When Spark SQL grouping can be used the work is delegated to
        ``pyspark.sql.functions.last``. Otherwise the per-key pandas frames
        are folded with ``combineByKey``, always keeping the most recently
        seen result.

        This returns a DataFrame.
        """
        myargs = self._myargs
        mykwargs = self._mykwargs
        # If its possible to use Spark SQL grouping do it
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.last)
        # The pandas fallback reads self._distributedRDD, so run the pandas
        # groupby preparation first.
        self._prep_pandas_groupby()

        def create_combiner(x):
            # Turn a raw frame into its per-group "last" result.
            return x.groupby(*myargs, **mykwargs).last()

        def merge_value(x, y):
            # y is a raw frame that arrived after combiner x; its result
            # replaces x.
            return create_combiner(y)

        def merge_combiner(x, y):
            # Between two combiners, keep the later one.
            return y

        rddOfLast = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfLast, self.sql_ctx)
Example #11
0
    def count(self):
        """Compute the number of elements in each group.

        When Spark SQL grouping can be used the work is delegated to
        ``pyspark.sql.functions.count``. Otherwise each pandas frame is
        counted per group and the partial counts are summed with
        ``combineByKey``.

        This returns a DataFrame.
        """
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.count)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            # Per-group counts for one raw frame.
            return x.groupby(*myargs, **mykwargs).count()

        def merge_combiner(x, y):
            # Partial counts must be added together. Counting the combined
            # partial results would report how many partials there were,
            # not how many rows. fill_value=0 makes an entry present on
            # only one side contribute zero from the other.
            return x.add(y, fill_value=0)

        def merge_value(x, y):
            # y is a raw frame: count it, then fold it into the running
            # counts held in x.
            return merge_combiner(x, create_combiner(y))

        rddOfCounts = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfCounts, self.sql_ctx)
Example #12
0
    def sum(self):
        """Compute the sum for each group.

        When Spark SQL grouping can be used the work is delegated to
        ``pyspark.sql.functions.sum``. Otherwise each pandas frame is summed
        per group and the partial sums are added together with
        ``combineByKey``.

        This returns a DataFrame.
        """
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.sum)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            # Per-group sums for one raw frame.
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_combiner(x, y):
            # Add two partial sums. fill_value=0 makes an entry present on
            # only one side contribute zero from the other instead of
            # producing NaN.
            return x.add(y, fill_value=0)

        def merge_value(x, y):
            # y is a raw frame: sum it, then add it to the running total in
            # x. Concatenating here would leave duplicate index entries
            # instead of a single summed row per group.
            return merge_combiner(x, create_combiner(y))

        rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx)