def apply(self, func, *args, **kwargs):
    """Apply the provided function to each group and combine the results
    together, in the same way as ``apply`` from groupby in pandas.

    Parameters
    ----------
    func : callable
        Function applied to each grouped pandas DataFrame.
    *args, **kwargs
        Forwarded to ``func``.

    Returns
    -------
    Sparkling Pandas DataFrame holding the combined results.
    """
    self._prep_pandas_groupby()

    def key_by_index(data):
        """Key each row by its index."""
        # TODO: Is there a better way to do this?
        for key, row in data.iterrows():
            # Dict literal instead of dict([(k, v)]) -- identical
            # behavior, clearer, no throwaway list (flake8-comprehensions C406).
            yield (key, pd.DataFrame.from_dict({key: row}, orient='index'))

    # Capture the groupby arguments locally so the closures shipped to
    # executors do not need to serialize self.
    myargs = self._myargs
    mykwargs = self._mykwargs
    regroupedRDD = self._distributedRDD.mapValues(
        lambda data: data.groupby(*myargs, **mykwargs))
    appliedRDD = regroupedRDD.map(
        lambda key_data: key_data[1].apply(func, *args, **kwargs))
    reKeyedRDD = appliedRDD.flatMap(key_by_index)
    dataframe = self._sortIfNeeded(reKeyedRDD).values()
    return DataFrame.fromDataFrameRDD(dataframe, self.sql_ctx)
def first(self):
    """Pull out the first element from each group.

    Note: this is different than Spark's first.
    """
    # Fast path: let Spark SQL perform the aggregation when possible.
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.first)
    grp_args = self._myargs
    grp_kwargs = self._mykwargs
    self._prep_pandas_groupby()

    def create_combiner(frame):
        # Reduce one partition's chunk to its per-group first rows.
        return frame.groupby(*grp_args, **grp_kwargs).first()

    def merge_value(combined, fresh):
        # Keep what the existing combiner already holds; the fresh
        # chunk arrives later so it cannot contribute a "first".
        return create_combiner(combined)

    def merge_combiner(left, right):
        # The left combiner came earlier in partition order; keep it.
        return left

    combinedByKey = self._distributedRDD.combineByKey(
        create_combiner, merge_value, merge_combiner)
    firstsRDD = self._sortIfNeeded(combinedByKey).values()
    return DataFrame.fromDataFrameRDD(firstsRDD, self.sql_ctx)
def var(self, ddof=1):
    """Compute the variance of groups, excluding missing values.

    (The method delegates to ``pandas.DataFrame.var``; the old docstring
    incorrectly described it as standard deviation.)

    For multiple groupings, the result index will be a MultiIndex.

    Parameters
    ----------
    ddof : int, default 1
        Delta degrees of freedom, forwarded to ``pandas.DataFrame.var``.

    Returns
    -------
    Sparkling Pandas DataFrame of per-group variances.
    """
    self._prep_pandas_groupby()
    return DataFrame.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(
            lambda x: x.var(ddof=ddof)),
        self.sql_ctx)
def median(self):
    """Compute median of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    self._prep_pandas_groupby()
    # Apply pandas' median to each regrouped per-key frame.
    mediansRDD = self._regroup_mergedRDD().values().map(
        lambda grouped: grouped.median())
    return DataFrame.fromDataFrameRDD(mediansRDD, self.sql_ctx)
def nth(self, n, *args, **kwargs):
    """Take the nth element of each groupby.

    Parameters
    ----------
    n : int
        Position to take from each group, forwarded to pandas
        ``GroupBy.nth``.
    *args, **kwargs
        Forwarded to pandas ``GroupBy.nth``.

    Returns
    -------
    Sparkling Pandas DataFrame with the nth row of each group.
    """
    # TODO: Stop collecting the entire frame for each key.
    self._prep_pandas_groupby()
    # The groupby arguments are already baked into _regroup_mergedRDD,
    # so the unused local copies of _myargs/_mykwargs were removed.
    nthRDD = self._regroup_mergedRDD().mapValues(
        lambda r: r.nth(n, *args, **kwargs)).values()
    return DataFrame.fromDataFrameRDD(nthRDD, self.sql_ctx)
def from_pandas_rdd(self, pandas_rdd):
    """Create a Sparkling Pandas DataFrame from the provided RDD of
    pandas DataFrames.

    Note: the current version drops index information.

    Parameters
    ----------
    pandas_rdd: RDD[pandas.DataFrame]

    Returns
    -------
    Sparkling Pandas DataFrame.
    """
    return DataFrame.fromDataFrameRDD(pandas_rdd, self.sql_ctx)
def mean(self):
    """Compute mean of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    # Prefer the Spark SQL aggregation path when it is available.
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.mean)
    self._prep_pandas_groupby()
    meansRDD = self._regroup_mergedRDD().values().map(
        lambda grouped: grouped.mean())
    return DataFrame.fromDataFrameRDD(meansRDD, self.sql_ctx)
def aggregate(self, f):
    """Apply the aggregation function.

    Note: This implementation does not take advantage of partial
    aggregation unless we have one of the special cases.
    Currently the only special case is Series.kurtosis - and even
    that doesn't properly do partial aggregations, but we can
    improve it to do this eventually!
    """
    # Special case: kurtosis can run through the Spark SQL grouping
    # path with a custom aggregation function.
    if self._can_use_new_school() and f == pd.Series.kurtosis:
        self._prep_spark_sql_groupby()
        import custom_functions as CF
        return self._use_aggregation(CF.kurtosis)
    else:
        # General case: materialize each group and apply f with
        # pandas' own aggregate (no partial aggregation).
        self._prep_pandas_groupby()
        return DataFrame.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(
                lambda g: g.aggregate(f)),
            self.sql_ctx)
def last(self):
    """Pull out the last from each group."""
    grp_args = self._myargs
    grp_kwargs = self._mykwargs
    # Fast path: let Spark SQL perform the aggregation when possible.
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.last)

    def create_combiner(frame):
        # Reduce one partition's chunk to its per-group last rows.
        return frame.groupby(*grp_args, **grp_kwargs).last()

    def merge_value(combined, fresh):
        # The fresh chunk arrives later, so its last rows win.
        return create_combiner(fresh)

    def merge_combiner(left, right):
        # The right combiner came later in partition order; keep it.
        return right

    combinedByKey = self._distributedRDD.combineByKey(
        create_combiner, merge_value, merge_combiner)
    lastsRDD = self._sortIfNeeded(combinedByKey).values()
    return DataFrame.fromDataFrameRDD(lastsRDD, self.sql_ctx)
def count(self):
    """Compute the number of elements in each group."""
    # Fast path: let Spark SQL perform the aggregation when possible.
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.count)
    self._prep_pandas_groupby()
    # Capture locally so the closures do not serialize self.
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        # Per-group counts for one partition's chunk of this key.
        return x.groupby(*myargs, **mykwargs).count()

    def merge_value(x, y):
        # x is an already-combined count frame, y is a raw chunk.
        # NOTE(review): appending a count frame and calling count()
        # again appears to count rows of count-frames rather than
        # summing the counts -- verify this is the intended semantics.
        return x.append(create_combiner(y)).count()

    def merge_combiner(x, y):
        # Merge two count frames; level=0 groups by the key level of
        # the index.
        return x.append(y).count(level=0)

    rddOfCounts = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner, merge_value, merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfCounts, self.sql_ctx)
def sum(self):
    """Compute the sum for each group."""
    # Fast path: let Spark SQL perform the aggregation when possible.
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.sum)
    self._prep_pandas_groupby()
    # Capture locally so the closures do not serialize self.
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        # Per-group partial sums for one partition's chunk of this key.
        return x.groupby(*myargs, **mykwargs).sum()

    def merge_value(x, y):
        # NOTE(review): this concatenates partial-sum frames, while
        # merge_combiner adds them element-wise (x + y). The asymmetry
        # looks suspicious -- confirm concatenation here is intentional.
        return pd.concat([x, create_combiner(y)])

    def merge_combiner(x, y):
        # Element-wise addition of two partial-sum frames.
        return x + y

    rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner, merge_value, merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx)