def first(self):
    """Pull out the first element from each group.

    Note: this is different than Spark's first.
    """
    # If it's possible to use Spark SQL grouping, do it.
    if self._can_use_new_school():
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.first)
    myargs = self._myargs
    mykwargs = self._mykwargs
    self._prep_old_school()

    def create_combiner(frame):
        # Reduce a raw partition frame to one row per group.
        return frame.groupby(*myargs, **mykwargs).first()

    def merge_value(combined, new_frame):
        # Earlier rows win, but later rows may still contribute groups
        # (or fill null cells) the running combiner hasn't seen. The old
        # code re-grouped the combiner itself and ignored `new_frame`
        # entirely — at best a wasted no-op, since the grouping keys have
        # already moved into the index.
        return combined.combine_first(create_combiner(new_frame))

    def merge_combiner(left, right):
        # Prefer rows from the earlier combiner, but keep groups that
        # only appear in the later one instead of dropping them
        # (previously this returned `left` unconditionally).
        return left.combine_first(right)

    rddOfFirst = self._sortIfNeeded(
        self._distributedRDD.combineByKey(
            create_combiner, merge_value, merge_combiner)).values()
    return Dataframe.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
def median(self):
    """Compute median of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    self._prep_old_school()
    perGroupFrames = self._regroup_mergedRDD().values()
    medianFrames = perGroupFrames.map(lambda frame: frame.median())
    return Dataframe.fromDataFrameRDD(medianFrames, self.sql_ctx)
def nth(self, n, *args, **kwargs):
    """Take the nth element of each group.

    Extra positional/keyword arguments are forwarded to pandas
    GroupBy.nth.
    """
    # TODO: Stop collecting the entire frame for each key.
    self._prep_old_school()
    # Note: the old code also captured self._myargs/_mykwargs here,
    # but they were never used — removed as dead locals.
    nthRDD = (self._regroup_mergedRDD()
              .mapValues(lambda r: r.nth(n, *args, **kwargs))
              .values())
    return Dataframe.fromDataFrameRDD(nthRDD, self.sql_ctx)
def var(self, ddof=1):
    """Compute the variance of each group, excluding missing values.

    (The previous docstring said "standard deviation", but this calls
    pandas ``var`` and returns the variance.)

    For multiple groupings, the result index will be a MultiIndex.

    :param ddof: Delta degrees of freedom; the divisor used is N - ddof.
    """
    self._prep_old_school()
    return Dataframe.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(
            lambda x: x.var(ddof=ddof)),
        self.sql_ctx)
def mean(self):
    """Compute mean of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    if self._can_use_new_school():
        # Spark SQL can do this aggregation for us.
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.mean)
    self._prep_old_school()
    perGroupMeans = self._regroup_mergedRDD().values().map(
        lambda frame: frame.mean())
    return Dataframe.fromDataFrameRDD(perGroupMeans, self.sql_ctx)
def aggregate(self, f):
    """Apply the aggregation function.

    Note: This implementation does not take advantage of partial
    aggregation unless we have one of the special cases. Currently the
    only special case is Series.kurtosis - and even that doesn't
    properly do partial aggregations, but we can improve it to do this
    eventually!
    """
    # Special-cased aggregations can run through Spark SQL.
    if self._can_use_new_school() and f == pd.Series.kurtosis:
        self._prep_new_school()
        import custom_functions as CF
        return self._use_aggregation(CF.kurtosis)
    self._prep_old_school()
    aggregated = self._regroup_mergedRDD().values().map(
        lambda group: group.aggregate(f))
    return Dataframe.fromDataFrameRDD(aggregated, self.sql_ctx)
def apply(self, func, *args, **kwargs):
    """Apply the provided function and combine the results together in
    the same way as apply from groupby in pandas.

    This returns a Dataframe.
    """
    self._prep_old_school()
    grouping_args = self._myargs
    grouping_kwargs = self._mykwargs

    def explode_by_index(frame):
        """Yield each row of ``frame`` as a one-row DataFrame keyed by
        its index value."""
        # TODO: Is there a better way to do this?
        for idx, row in frame.iterrows():
            single = pd.DataFrame.from_dict(dict([(idx, row)]),
                                            orient="index")
            yield (idx, single)

    appliedRDD = (self._distributedRDD
                  .mapValues(lambda frame: frame.groupby(
                      *grouping_args, **grouping_kwargs))
                  .map(lambda kv: kv[1].apply(func, *args, **kwargs))
                  .flatMap(explode_by_index))
    sortedFrames = self._sortIfNeeded(appliedRDD).values()
    return Dataframe.fromDataFrameRDD(sortedFrames, self.sql_ctx)
def last(self):
    """Pull out the last from each group."""
    myargs = self._myargs
    mykwargs = self._mykwargs
    # If it's possible to use Spark SQL grouping, do it.
    if self._can_use_new_school():
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.last)
    # Every other old-school path preps first; this one previously
    # skipped it.
    self._prep_old_school()

    def create_combiner(frame):
        # Reduce a raw partition frame to one row per group.
        return frame.groupby(*myargs, **mykwargs).last()

    def merge_value(combined, new_frame):
        # Newer values win, but groups that only exist in the running
        # combiner must be kept. (The old code returned
        # create_combiner(new_frame), silently dropping those groups.)
        return create_combiner(new_frame).combine_first(combined)

    def merge_combiner(left, right):
        # `right` is the later combiner, so its rows take precedence;
        # fall back to `left` for groups `right` never saw (previously
        # this returned `right` unconditionally, losing them).
        return right.combine_first(left)

    rddOfLast = self._sortIfNeeded(
        self._distributedRDD.combineByKey(
            create_combiner, merge_value, merge_combiner)).values()
    return Dataframe.fromDataFrameRDD(rddOfLast, self.sql_ctx)
def max(self):
    """Compute the max for each group."""
    if self._can_use_new_school():
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.max)
    self._prep_old_school()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(frame):
        # Reduce a raw partition frame to one max row per group.
        return frame.groupby(*myargs, **mykwargs).max()

    def merge_value(combined, new_frame):
        # Re-aggregate per group (index level 0). The old bare .max()
        # collapsed the appended frame into a single Series, destroying
        # the per-group structure; merge_combiner already did this
        # correctly with level=0.
        return combined.append(create_combiner(new_frame)).max(level=0)

    def merge_combiner(left, right):
        return left.append(right).max(level=0)

    rddOfMax = self._sortIfNeeded(
        self._distributedRDD.combineByKey(
            create_combiner, merge_value, merge_combiner)).values()
    return Dataframe.fromDataFrameRDD(rddOfMax, self.sql_ctx)
def sum(self):
    """Compute the sum for each group."""
    if self._can_use_new_school():
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.sum)
    self._prep_old_school()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(frame):
        # Reduce a raw partition frame to per-group partial sums.
        return frame.groupby(*myargs, **mykwargs).sum()

    def merge_value(combined, new_frame):
        # Add the new partial sums into the running combiner. The old
        # pd.concat left duplicate (un-summed) rows per group.
        # fill_value=0 keeps groups present on only one side.
        return combined.add(create_combiner(new_frame), fill_value=0)

    def merge_combiner(left, right):
        # Plain `left + right` would produce NaN for any group missing
        # from one of the combiners.
        return left.add(right, fill_value=0)

    rddOfSum = self._sortIfNeeded(
        self._distributedRDD.combineByKey(
            create_combiner, merge_value, merge_combiner)).values()
    return Dataframe.fromDataFrameRDD(rddOfSum, self.sql_ctx)