def median(self):
    """Compute median of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    self._prep_old_school()
    return Dataframe.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(lambda x: x.median()),
        self.sql_ctx)
def from_spark_rdd(self, spark_rdd, sql_ctx):
    """Translate a Spark DataFrame into a SparklingPandas Dataframe.

    :param spark_rdd: Input Spark DataFrame to convert
    :return: Matching SparklingPandas Dataframe
    """
    return Dataframe.from_spark_rdd(spark_rdd, sql_ctx)
def first(self):
    """Pull out the first from each group. Note: this is different
    from Spark's first.
    """
    # If it's possible to use Spark SQL grouping, do it.
    if self._can_use_new_school():
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.first)
    myargs = self._myargs
    mykwargs = self._mykwargs
    self._prep_old_school()

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).first()

    def merge_value(x, y):
        # The combiner already holds the first row for each group, so
        # later values for the same key are ignored.
        return x

    def merge_combiner(x, y):
        return x

    rddOfFirst = self._sortIfNeeded(
        self._distributedRDD.combineByKey(create_combiner,
                                          merge_value,
                                          merge_combiner)).values()
    return Dataframe.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
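# A plain-pandas sketch (hypothetical data, not part of this class) of the
# combineByKey contract used by first above and by last/max/sum below:
# create_combiner turns a single frame into a partial per-group result,
# merge_value folds one more frame into an existing partial, and
# merge_combiner merges two partials from different partitions.
#
#     import pandas as pd
#     a = pd.DataFrame({"k": ["x"], "v": [1]})
#     b = pd.DataFrame({"k": ["x"], "v": [2]})
#     partial = a.groupby("k").first()  # create_combiner(a): v == 1 for "x"
#     partial                           # merge_value(partial, b): first seen wins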
def nth(self, n, *args, **kwargs):
    """Take the nth element of each group."""
    # TODO: Stop collecting the entire frame for each key.
    self._prep_old_school()
    nthRDD = self._regroup_mergedRDD().mapValues(
        lambda r: r.nth(n, *args, **kwargs)).values()
    return Dataframe.fromDataFrameRDD(nthRDD, self.sql_ctx)
def var(self, ddof=1):
    """Compute variance of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    self._prep_old_school()
    return Dataframe.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(
            lambda x: x.var(ddof=ddof)),
        self.sql_ctx)
def _use_aggregation(self, agg, columns=None):
    """Compute the result using the aggregation function provided.

    Each aggregated column is aliased back to its original name so we
    can strip off the extra name that Spark SQL adds."""
    if not columns:
        columns = self._columns
    aggs = [agg(column).alias(column) for column in columns]
    aggRdd = self._grouped_spark_sql.agg(*aggs)
    return Dataframe.from_schema_rdd(aggRdd, self._by)
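# Why the alias matters: Spark SQL names aggregate output columns after the
# function (e.g. something like "AVG(price)"), so aliasing each column back
# to its own name preserves the pandas-style schema. A minimal standalone
# sketch, assuming a Spark DataFrame `df` with hypothetical columns "k"
# and "v":
#
#     import pyspark.sql.functions as func
#     df.groupBy("k").agg(func.mean("v"))             # auto-generated column name
#     df.groupBy("k").agg(func.mean("v").alias("v"))  # column keeps the name "v"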
def mean(self):
    """Compute mean of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    if self._can_use_new_school():
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.mean)
    self._prep_old_school()
    return Dataframe.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(lambda x: x.mean()),
        self.sql_ctx)
def aggregate(self, f):
    """Apply the aggregation function.

    Note: This implementation does not take advantage of partial
    aggregation unless we have one of the special cases. Currently the
    only special case is Series.kurtosis, and even that doesn't properly
    do partial aggregations, but we can improve it to do this eventually!
    """
    if self._can_use_new_school() and f == pd.Series.kurtosis:
        self._prep_new_school()
        import custom_functions as CF
        return self._use_aggregation(CF.kurtosis)
    else:
        self._prep_old_school()
        return Dataframe.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(
                lambda g: g.aggregate(f)),
            self.sql_ctx)
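# A usage sketch for aggregate, assuming a Groupby instance named `grouped`
# (hypothetical): pd.Series.kurtosis takes the Spark SQL fast path, while
# any other function falls back to regrouping and applying per group.
#
#     kurt = grouped.aggregate(pd.Series.kurtosis)             # new-school path
#     spread = grouped.aggregate(lambda s: s.max() - s.min())  # old-school path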
def from_pd_data_frame(self, local_df):
    """Make a distributed dataframe from a local dataframe.

    The intended use is for testing. Note: dtypes are re-inferred, so
    they may not match."""
    def frame_to_rows(frame):
        """Convert a pandas DataFrame into a list of Spark SQL rows."""
        # TODO: Convert to row objects directly?
        return [r.tolist() for r in frame.to_records()]
    schema = list(local_df.columns)
    index_names = list(local_df.index.names)
    index_names = _normalize_index_names(index_names)
    schema = index_names + schema
    rows = self.spark_ctx.parallelize(frame_to_rows(local_df))
    sp_df = Dataframe.from_schema_rdd(
        self.sql_ctx.createDataFrame(
            rows,
            schema=schema,
            # Look at all of the rows; this should be ok since the data
            # is coming from a local dataset.
            samplingRatio=1))
    sp_df._index_names = index_names
    return sp_df
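# A usage sketch, assuming a context object named `psc` that provides this
# method (the data and column names are hypothetical):
#
#     import pandas as pd
#     local_df = pd.DataFrame({"name": ["a", "b", "a"], "age": [1, 2, 3]})
#     sp_df = psc.from_pd_data_frame(local_df)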
def apply(self, func, *args, **kwargs):
    """Apply the provided function and combine the results together in
    the same way as apply from groupby in pandas.

    This returns a Dataframe.
    """
    self._prep_old_school()

    def key_by_index(data):
        """Key each row by its index."""
        # TODO: Is there a better way to do this?
        for key, row in data.iterrows():
            yield (key, pd.DataFrame.from_dict(dict([(key, row)]),
                                               orient="index"))

    myargs = self._myargs
    mykwargs = self._mykwargs
    regroupedRDD = self._distributedRDD.mapValues(
        lambda data: data.groupby(*myargs, **mykwargs))
    appliedRDD = regroupedRDD.map(
        lambda key_data: key_data[1].apply(func, *args, **kwargs))
    reKeyedRDD = appliedRDD.flatMap(key_by_index)
    dataframe = self._sortIfNeeded(reKeyedRDD).values()
    return Dataframe.fromDataFrameRDD(dataframe, self.sql_ctx)
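# A usage sketch mirroring pandas groupby().apply(), assuming a distributed
# dataframe `sp_df` with a hypothetical "name" column:
#
#     demeaned = sp_df.groupby("name").apply(lambda g: g - g.mean())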
def last(self):
    """Pull out the last from each group."""
    myargs = self._myargs
    mykwargs = self._mykwargs
    # If it's possible to use Spark SQL grouping, do it.
    if self._can_use_new_school():
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.last)
    self._prep_old_school()

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).last()

    def merge_value(x, y):
        # A newer frame for the same key replaces the old partial result.
        return create_combiner(y)

    def merge_combiner(x, y):
        return y

    rddOfLast = self._sortIfNeeded(
        self._distributedRDD.combineByKey(create_combiner,
                                          merge_value,
                                          merge_combiner)).values()
    return Dataframe.fromDataFrameRDD(rddOfLast, self.sql_ctx)
def max(self):
    """Compute the max for each group."""
    if self._can_use_new_school():
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.max)
    self._prep_old_school()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).max()

    def merge_value(x, y):
        # Reduce per group (level 0 of the index) so the partial result
        # keeps one row per group instead of collapsing to a Series.
        return x.append(create_combiner(y)).max(level=0)

    def merge_combiner(x, y):
        return x.append(y).max(level=0)

    rddOfMax = self._sortIfNeeded(
        self._distributedRDD.combineByKey(create_combiner,
                                          merge_value,
                                          merge_combiner)).values()
    return Dataframe.fromDataFrameRDD(rddOfMax, self.sql_ctx)
def sum(self):
    """Compute the sum for each group."""
    if self._can_use_new_school():
        self._prep_new_school()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.sum)
    self._prep_old_school()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).sum()

    def merge_value(x, y):
        # Fold the new frame's per-group sums into the running partial
        # result, keeping one row per group.
        return pd.concat([x, create_combiner(y)]).sum(level=0)

    def merge_combiner(x, y):
        return x + y

    rddOfSum = self._sortIfNeeded(
        self._distributedRDD.combineByKey(create_combiner,
                                          merge_value,
                                          merge_combiner)).values()
    return Dataframe.fromDataFrameRDD(rddOfSum, self.sql_ctx)
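# A plain-pandas illustration (hypothetical data) of the sum merge
# semantics: each partial is a per-group frame of sums, and merging two
# partials adds them index-wise.
#
#     import pandas as pd
#     a = pd.DataFrame({"k": ["x", "x"], "v": [1, 2]})
#     b = pd.DataFrame({"k": ["x"], "v": [4]})
#     pa = a.groupby("k").sum()  # create_combiner(a): v == 3 for "x"
#     pb = b.groupby("k").sum()  # create_combiner(b): v == 4 for "x"
#     pa + pb                    # merge_combiner: v == 7 for "x"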
def _from_pandas_rdd_records(self, pandas_rdd_records, schema):
    """Create a L{Dataframe} from an RDD of records with a schema."""
    return Dataframe.from_spark_rdd(
        self.sql_ctx.createDataFrame(pandas_rdd_records,
                                     schema.values.tolist()),
        self.sql_ctx)
def table(self, table):
    """Returns the provided table as a L{Dataframe}."""
    return Dataframe.from_spark_rdd(self.sql_ctx.table(table),
                                    self.sql_ctx)
def sql(self, query):
    """Perform a SQL query and create a L{Dataframe} of the result."""
    return Dataframe.from_spark_rdd(self.sql_ctx.sql(query),
                                    self.sql_ctx)
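# A usage sketch for table and sql, assuming a context object named `psc`
# and an already-registered table (the table name is hypothetical):
#
#     people = psc.table("people")
#     adults = psc.sql("SELECT * FROM people WHERE age > 21")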