def apply(self, func, *args, **kwargs):
    """Apply the provided function and combine the results together in the
    same way as apply from groupby in pandas.

    This returns a DataFrame.
    """
    self._prep_pandas_groupby()

    def key_by_index(data):
        """Key each row by its index."""
        # TODO: Is there a better way to do this?
        for key, row in data.iterrows():
            yield (key, pd.DataFrame.from_dict(
                dict([(key, row)]), orient='index'))

    myargs = self._myargs
    mykwargs = self._mykwargs
    regroupedRDD = self._distributedRDD.mapValues(
        lambda data: data.groupby(*myargs, **mykwargs))
    appliedRDD = regroupedRDD.map(
        lambda key_data: key_data[1].apply(func, *args, **kwargs))
    reKeyedRDD = appliedRDD.flatMap(key_by_index)
    dataframe = self._sortIfNeeded(reKeyedRDD).values()
    return DataFrame.fromDataFrameRDD(dataframe, self.sql_ctx)
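# Illustrative usage, a sketch rather than a tested example (assumes `psc` is
# an already-constructed Sparkling Pandas context and that the frame has a
# "color" grouping column; all names here are illustrative):
#
# >>> df = psc.from_pd_data_frame(local_df)
# >>> demeaned = df.groupby("color").apply(
# ...     lambda group: group - group.mean())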
def from_pd_data_frame(self, local_df):
    """Make a Sparkling Pandas DataFrame from a local pandas DataFrame.
    The intended use is for testing or joining distributed data with local
    data. The types are re-inferred, so they may not match.

    Parameters
    ----------
    local_df: pandas.DataFrame
        The data to turn into a distributed Sparkling Pandas DataFrame.
        See http://bit.ly/pandasDataFrame for docs.

    Returns
    -------
    A Sparkling Pandas DataFrame.
    """
    def frame_to_rows(frame):
        """Convert a pandas DataFrame into a list of Spark SQL Rows."""
        # TODO: Convert to row objects directly?
        return [r.tolist() for r in frame.to_records()]
    schema = list(local_df.columns)
    index_names = list(local_df.index.names)
    index_names = _normalize_index_names(index_names)
    schema = index_names + schema
    rows = self.spark_ctx.parallelize(frame_to_rows(local_df))
    sp_df = DataFrame.from_schema_rdd(
        self.sql_ctx.createDataFrame(
            rows,
            schema=schema,
            # Look at all the rows; this should be fine since the data is
            # coming from a local dataset.
            samplingRatio=1))
    sp_df._index_names = index_names
    return sp_df
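# A minimal usage sketch (assumes `psc` is a PSparkContext built elsewhere;
# the variable names are illustrative only):
#
# >>> import pandas as pd
# >>> local_df = pd.DataFrame({"color": ["red", "blue"], "price": [1, 2]})
# >>> distributed = psc.from_pd_data_frame(local_df)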
def first(self):
    """Pull out the first row from each group. Note: this is different
    from Spark's first."""
    # If it's possible to use Spark SQL grouping, do it.
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.first)
    myargs = self._myargs
    mykwargs = self._mykwargs
    self._prep_pandas_groupby()

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).first()

    def merge_value(x, y):
        # x already holds the first row seen per group, so keep it.
        return x

    def merge_combiner(x, y):
        return x

    rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
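# For reference, the combineByKey contract used above, shown on a plain local
# sketch (partition-ordering caveats aside; the ints stand in for the pandas
# DataFrames the real code passes around, and `sc` is an assumed SparkContext):
#
# >>> sc.parallelize([("k", 1), ("k", 2)]).combineByKey(
# ...     lambda v: v,        # create_combiner: first value seen wins
# ...     lambda c, v: c,     # merge_value: keep the existing first
# ...     lambda c1, c2: c1,  # merge_combiner: keep the earlier combiner
# ... ).collect()
# [('k', 1)]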
def var(self, ddof=1):
    """Compute the variance of each group, excluding missing values.
    For multiple groupings, the result index will be a MultiIndex.
    """
    self._prep_pandas_groupby()
    return DataFrame.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(
            lambda x: x.var(ddof=ddof)),
        self.sql_ctx)
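# `ddof` matches pandas: with the default ddof=1 the divisor is N - 1
# (sample variance). Usage sketch (names illustrative):
#
# >>> df.groupby("color").var(ddof=0)  # population variance per group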
def median(self):
    """Compute the median of each group, excluding missing values.
    For multiple groupings, the result index will be a MultiIndex.
    """
    self._prep_pandas_groupby()
    return DataFrame.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(
            lambda x: x.median()),
        self.sql_ctx)
def nth(self, n, *args, **kwargs):
    """Take the nth element of each group."""
    # TODO: Stop collecting the entire frame for each key.
    self._prep_pandas_groupby()
    nthRDD = self._regroup_mergedRDD().mapValues(
        lambda r: r.nth(n, *args, **kwargs)).values()
    return DataFrame.fromDataFrameRDD(nthRDD, self.sql_ctx)
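# Usage sketch (names illustrative): take the second row of each group,
# mirroring pandas' GroupBy.nth semantics.
#
# >>> df.groupby("color").nth(1)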
def from_pandas_rdd(self, pandas_rdd):
    """Create a Sparkling Pandas DataFrame from the provided RDD, which is
    composed of pandas DataFrames. Note: the current version drops index
    information.

    Parameters
    ----------
    pandas_rdd: RDD[pandas.DataFrame]

    Returns
    -------
    A Sparkling Pandas DataFrame.
    """
    return DataFrame.fromDataFrameRDD(pandas_rdd, self.sql_ctx)
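# Usage sketch (assumes `sc` is a SparkContext and `psc` the Sparkling Pandas
# context; names illustrative): distribute several small pandas frames.
#
# >>> frames = [pd.DataFrame({"x": [1]}), pd.DataFrame({"x": [2]})]
# >>> df = psc.from_pandas_rdd(sc.parallelize(frames))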
def _use_aggregation(self, agg, columns=None):
    """Compute the result using the aggregation function provided.
    The aggregation name must also be provided so we can strip off the
    extra name that Spark SQL adds."""
    if not columns:
        columns = self._columns
    # Alias each aggregated column back to its original name so the Spark
    # SQL naming (e.g. "AVG(price)") doesn't leak into the result.
    aggs = [agg(column).alias(column) for column in columns]
    aggRdd = self._grouped_spark_sql.agg(*aggs)
    df = DataFrame.from_schema_rdd(aggRdd, self._by)
    return df
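# For a grouping with columns ["price", "qty"] and `agg=func.mean`, the list
# above expands to roughly the following Spark SQL call (a sketch of the
# behavior, not additional library code):
#
# >>> grouped.agg(func.mean("price").alias("price"),
# ...             func.mean("qty").alias("qty"))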
def table(self, table):
    """Return the provided Spark SQL table as a L{DataFrame}.

    Parameters
    ----------
    table: string
        The name of the Spark SQL table to turn into a L{DataFrame}.

    Returns
    -------
    A Sparkling Pandas DataFrame.
    """
    return DataFrame.from_spark_rdd(self.sql_ctx.table(table),
                                    self.sql_ctx)
def from_spark_rdd(self, spark_rdd):
    """Translate a Spark DataFrame into a Sparkling Pandas DataFrame.
    Currently no checking or validation occurs.

    Parameters
    ----------
    spark_rdd: Spark DataFrame
        Input Spark DataFrame.

    Returns
    -------
    A Sparkling Pandas DataFrame.
    """
    return DataFrame.from_spark_rdd(spark_rdd, self.sql_ctx)
def sql(self, query):
    """Perform a SQL query and create a L{DataFrame} of the result.
    The SQL query is run using Spark SQL. This is not intended for
    querying arbitrary databases, but rather for querying Spark SQL
    tables.

    Parameters
    ----------
    query: string
        The SQL query to pass to Spark SQL to execute.

    Returns
    -------
    A Sparkling Pandas DataFrame.
    """
    return DataFrame.from_spark_rdd(self.sql_ctx.sql(query), self.sql_ctx)
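# Usage sketch (assumes a table "users" has been registered with Spark SQL
# elsewhere; the query is illustrative):
#
# >>> df = ctx.sql("SELECT name, age FROM users WHERE age > 21")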
def mean(self):
    """Compute the mean of each group, excluding missing values.
    For multiple groupings, the result index will be a MultiIndex.
    """
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.mean)
    self._prep_pandas_groupby()
    return DataFrame.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(
            lambda x: x.mean()),
        self.sql_ctx)
def aggregate(self, f):
    """Apply the aggregation function.
    Note: This implementation does not take advantage of partial
    aggregation unless we hit one of the special cases. Currently the
    only special case is Series.kurtosis, and even that doesn't properly
    do partial aggregations, but we can improve it to do so eventually.
    """
    if self._can_use_new_school() and f == pd.Series.kurtosis:
        self._prep_spark_sql_groupby()
        import custom_functions as CF
        return self._use_aggregation(CF.kurtosis)
    else:
        self._prep_pandas_groupby()
        return DataFrame.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(
                lambda g: g.aggregate(f)),
            self.sql_ctx)
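# Usage sketch: the kurtosis special case takes the Spark SQL fast path,
# while anything else falls back to per-group pandas aggregation (names
# illustrative):
#
# >>> df.groupby("color").aggregate(pd.Series.kurtosis)            # fast path
# >>> df.groupby("color").aggregate(lambda s: s.max() - s.min())   # fallback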
def get_result(self):
    def list_head(some_list):
        if some_list:
            return some_list[0]

    self._validate_specification()
    left_rdd_with_suffixes, \
        right_rdd_with_suffixes = self._prep_for_merge()

    def create_condition(left_rdd, right_rdd, left_on, right_on):
        return getattr(left_rdd, left_on) == \
            getattr(right_rdd, right_on)

    def join_condition(left_rdd, right_rdd, left_on, right_on):
        condition = create_condition(left_rdd, right_rdd,
                                     left_on[0], right_on[0])
        for (a, b) in zip(left_on[1:], right_on[1:]):
            # Spark SQL Columns must be combined with & rather than `and`.
            condition = condition & create_condition(
                left_rdd, right_rdd, a, b)
        return condition

    if self.on is not None:
        joined = left_rdd_with_suffixes.join(right_rdd_with_suffixes,
                                             list_head(self.on),
                                             self.how)
    else:
        joined = left_rdd_with_suffixes. \
            join(right_rdd_with_suffixes,
                 join_condition(left_rdd_with_suffixes,
                                right_rdd_with_suffixes,
                                self.left_on,
                                self.right_on),
                 self.how)
    if self.sort:
        # According to the Spark documentation, we can only sort
        # by one column.
        if self.on:
            joined = joined.sort(list_head(self.on))
        else:
            joined = joined.sort(list_head(self.left_on))
    return DataFrame.from_schema_rdd(joined)
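# The chained join condition above builds a Spark SQL Column expression; on
# two keys it is equivalent to the following (a sketch with illustrative
# frame and column names):
#
# >>> cond = (left.k1 == right.k1) & (left.k2 == right.k2)
# >>> joined = left.join(right, cond, "inner")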
def sum(self):
    """Compute the sum for each group."""
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.sum)
    self._prep_pandas_groupby()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).sum()

    def merge_value(x, y):
        # Add the new frame's per-group sums into the running sums;
        # fill_value keeps groups that appear on only one side.
        return x.add(create_combiner(y), fill_value=0)

    def merge_combiner(x, y):
        return x.add(y, fill_value=0)

    rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx)
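# The fill_value=0 above matters when a group appears in only one side of a
# merge; a plain `+` would align on the group index and produce NaN instead.
# A local sketch of the same merge:
#
# >>> a = pd.DataFrame({"price": [3]}, index=["red"])
# >>> b = pd.DataFrame({"price": [4]}, index=["blue"])
# >>> a.add(b, fill_value=0)  # both groups survive with their sums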
def count(self):
    """Compute the number of elements in each group."""
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.count)
    self._prep_pandas_groupby()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).count()

    def merge_value(x, y):
        # Same merge pattern as sum: add the new frame's per-group counts,
        # keeping groups that appear on only one side.
        return x.add(create_combiner(y), fill_value=0)

    def merge_combiner(x, y):
        return x.add(y, fill_value=0)

    rddOfCounts = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfCounts, self.sql_ctx)
def last(self):
    """Pull out the last row from each group."""
    myargs = self._myargs
    mykwargs = self._mykwargs
    # If it's possible to use Spark SQL grouping, do it.
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.last)
    self._prep_pandas_groupby()

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).last()

    def merge_value(x, y):
        # A later frame replaces the earlier one.
        return create_combiner(y)

    def merge_combiner(x, y):
        return y

    rddOfLast = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfLast, self.sql_ctx)