Beispiel #1
0
    def apply(self, func, *args, **kwargs):
        """Apply the provided function and combine the results together in the
        same way as apply from groupby in pandas.

        This returns a DataFrame.
        """
        self._prep_pandas_groupby()

        def index_keyed_rows(frame):
            """Yield (index, single-row DataFrame) pairs for each row."""
            # TODO: Is there a better way to do this?
            for idx, row in frame.iterrows():
                single_row = pd.DataFrame.from_dict({idx: row},
                                                    orient='index')
                yield (idx, single_row)

        groupby_args = self._myargs
        groupby_kwargs = self._mykwargs
        # Group each partition's frame locally, apply func per group,
        # then re-key the result rows by their index for sorting.
        grouped = self._distributedRDD.mapValues(
            lambda frame: frame.groupby(*groupby_args, **groupby_kwargs))
        applied = grouped.map(
            lambda kv: kv[1].apply(func, *args, **kwargs))
        rekeyed = applied.flatMap(index_keyed_rows)
        result = self._sortIfNeeded(rekeyed).values()
        return DataFrame.fromDataFrameRDD(result, self.sql_ctx)
Beispiel #2
0
 def from_pd_data_frame(self, local_df):
     """Make a Sparkling Pandas dataframe from a local Pandas DataFrame.
     Intended for testing or for joining distributed data with local
     data. The types are re-inferred, so they may not match.
     Parameters
     ----------
     local_df: Pandas DataFrame
         The data to turn into a distributed Sparkling Pandas DataFrame.
         See http://bit.ly/pandasDataFrame for docs.
     Returns
     -------
     A Sparkling Pandas DataFrame.
     """
     def record_rows(frame):
         """Convert a Pandas DataFrame into a list of Spark SQL rows."""
         # TODO: Convert to row objects directly?
         rows = []
         for record in frame.to_records():
             rows.append(record.tolist())
         return rows
     # Index columns go first so the index survives the round trip.
     index_names = _normalize_index_names(list(local_df.index.names))
     schema = index_names + list(local_df.columns)
     parallelized = self.spark_ctx.parallelize(record_rows(local_df))
     spark_df = self.sql_ctx.createDataFrame(
         parallelized,
         schema=schema,
         # Sampling every row is fine since the data started out local.
         samplingRatio=1)
     sp_df = DataFrame.from_schema_rdd(spark_df)
     sp_df._index_names = index_names
     return sp_df
Beispiel #3
0
    def first(self):
        """
        Pull out the first from each group. Note: this is different than
        Spark's first.
        """
        # Prefer the Spark SQL aggregation path when it is available.
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.first)
        groupby_args = self._myargs
        groupby_kwargs = self._mykwargs
        self._prep_pandas_groupby()

        def make_combiner(frame):
            # First row of each group within one partition's frame.
            return frame.groupby(*groupby_args, **groupby_kwargs).first()

        def add_frame(combined, frame):
            # The accumulated combiner already holds the earliest rows.
            return make_combiner(combined)

        def merge_combined(left, right):
            # Keep the left combiner: it came first.
            return left

        firsts = self._distributedRDD.combineByKey(
            make_combiner, add_frame, merge_combined)
        sorted_firsts = self._sortIfNeeded(firsts).values()
        return DataFrame.fromDataFrameRDD(sorted_firsts, self.sql_ctx)
 def from_pd_data_frame(self, local_df):
     """Make a Sparkling Pandas dataframe from a local Pandas DataFrame.
     The intended use is for testing or joining distributed data with
     local data. The types are re-inferred, so they may not match.
     Parameters
     ----------
     local_df: Pandas DataFrame
         The data to turn into a distributed Sparkling Pandas DataFrame.
         See http://bit.ly/pandasDataFrame for docs.
     Returns
     -------
     A Sparkling Pandas DataFrame.
     """
     def frame_to_rows(frame):
         """Convert a Pandas DataFrame into a list of Spark SQL Rows"""
         # TODO: Convert to row objects directly?
         return [record.tolist() for record in frame.to_records()]
     # Prefix the (normalized) index names so they become leading columns.
     raw_index_names = list(local_df.index.names)
     index_names = _normalize_index_names(raw_index_names)
     schema = index_names + list(local_df.columns)
     distributed_rows = self.spark_ctx.parallelize(frame_to_rows(local_df))
     schema_rdd = self.sql_ctx.createDataFrame(
         distributed_rows,
         schema=schema,
         # Look at all the rows; should be ok since the data was local.
         samplingRatio=1)
     sp_df = DataFrame.from_schema_rdd(schema_rdd)
     sp_df._index_names = index_names
     return sp_df
Beispiel #5
0
    def var(self, ddof=1):
        """Compute variance of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof: int, default 1
            Delta degrees of freedom, forwarded to pandas' ``var``.
        """
        # NOTE: the original docstring said "standard deviation", but the
        # computation below is DataFrame.var, i.e. variance.
        self._prep_pandas_groupby()
        return DataFrame.fromDataFrameRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.var(ddof=ddof)), self.sql_ctx)
Beispiel #6
0
    def median(self):
        """Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        self._prep_pandas_groupby()
        # Each value is an already-grouped frame; take its medians locally.
        medians = self._regroup_mergedRDD().values().map(
            lambda frame: frame.median())
        return DataFrame.fromDataFrameRDD(medians, self.sql_ctx)
Beispiel #7
0
 def nth(self, n, *args, **kwargs):
     """Take the nth element of each groupby.

     Parameters
     ----------
     n: int
         Position to take from each group; extra args/kwargs are
         forwarded to pandas' ``nth``.
     Returns
     -------
     Sparkling Pandas DataFrame.
     """
     # TODO: Stop collecting the entire frame for each key.
     self._prep_pandas_groupby()
     # BUG FIX: the original bound self._myargs/_mykwargs to locals that
     # were never used; the grouped frames are already built here.
     nthRDD = self._regroup_mergedRDD().mapValues(
         lambda r: r.nth(
             n, *args, **kwargs)).values()
     return DataFrame.fromDataFrameRDD(nthRDD, self.sql_ctx)
Beispiel #8
0
 def from_pandas_rdd(self, pandas_rdd):
     """Create a Sparkling Pandas DataFrame from the provided RDD
     which is comprised of Panda's DataFrame. Note: the current version
     drops index information.
     Parameters
     ----------
     pandas_rdd: RDD[pandas.DataFrame]
     Returns
     -------
     Sparkling Pandas DataFrame."""
     result = DataFrame.fromDataFrameRDD(pandas_rdd, self.sql_ctx)
     return result
 def from_pandas_rdd(self, pandas_rdd):
     """Wrap an RDD of Panda's DataFrames as a Sparkling Pandas
     DataFrame. Note: the current version drops index information.
     Parameters
     ----------
     pandas_rdd: RDD[pandas.DataFrame]
     Returns
     -------
     Sparkling Pandas DataFrame."""
     # Delegate straight to the DataFrame factory.
     return DataFrame.fromDataFrameRDD(pandas_rdd, self.sql_ctx)
Beispiel #10
0
 def _use_aggregation(self, agg, columns=None):
     """Compute the result using the aggregation function provided.

     Parameters
     ----------
     agg: callable
         Spark SQL aggregation function; called as ``agg(column_name)``
         and expected to return a Column.
     columns: list of string, optional
         The columns to aggregate; defaults to ``self._columns``.
     Returns
     -------
     Sparkling Pandas DataFrame with one aggregated value per column.
     """
     if not columns:
         columns = self._columns
     # BUG FIX: the original aggregated self._columns, silently ignoring
     # the `columns` argument it had just computed, and imported
     # pyspark.sql.functions without using it.
     # Aliasing back to the plain column name strips the extra name
     # Spark SQL adds (e.g. "avg(col)").
     aggs = [agg(column).alias(column) for column in columns]
     aggRdd = self._grouped_spark_sql.agg(*aggs)
     df = DataFrame.from_schema_rdd(aggRdd, self._by)
     return df
Beispiel #11
0
 def table(self, table):
     """Returns the provided Spark SQL table as a L{DataFrame}
     Parameters
     ----------
     table: string
         The name of the Spark SQL table to turn into a L{DataFrame}
     Returns
     -------
     Sparkling Pandas DataFrame.
     """
     spark_table = self.sql_ctx.table(table)
     return DataFrame.from_spark_rdd(spark_table, self.sql_ctx)
 def table(self, table):
     """Look up a Spark SQL table by name and wrap it as a L{DataFrame}.
     Parameters
     ----------
     table: string
         The name of the Spark SQL table to turn into a L{DataFrame}
     Returns
     -------
     Sparkling Pandas DataFrame.
     """
     # Resolve the table through the SQL context, then wrap the result.
     resolved = self.sql_ctx.table(table)
     return DataFrame.from_spark_rdd(resolved, self.sql_ctx)
Beispiel #13
0
 def from_spark_rdd(self, spark_rdd):
     """
     Translates a Spark DataFrame into a Sparkling Pandas Dataframe.
     Currently, no checking or validation occurs.
     Parameters
     ----------
     spark_rdd: Spark DataFrame
         Input Spark DataFrame.
     Returns
     -------
     Sparkling Pandas DataFrame.
     """
     wrapped = DataFrame.from_spark_rdd(spark_rdd, self.sql_ctx)
     return wrapped
Beispiel #14
0
 def sql(self, query):
     """Perform a SQL query and create a L{DataFrame} of the result.
     The SQL query is run using Spark SQL. This is not intended for
     querying arbitrary databases, but rather querying Spark SQL tables.
     Parameters
     ----------
     query: string
         The SQL query to pass to Spark SQL to execute.
     Returns
     -------
     Sparkling Pandas DataFrame.
     """
     query_result = self.sql_ctx.sql(query)
     return DataFrame.from_spark_rdd(query_result, self.sql_ctx)
 def from_spark_rdd(self, spark_rdd):
     """
     Wrap a Spark DataFrame as a Sparkling Pandas Dataframe.
     Currently, no checking or validation occurs.
     Parameters
     ----------
     spark_rdd: Spark DataFrame
         Input Spark DataFrame.
     Returns
     -------
     Sparkling Pandas DataFrame.
     """
     # Thin pass-through to the DataFrame factory.
     return DataFrame.from_spark_rdd(spark_rdd, self.sql_ctx)
 def sql(self, query):
     """Run a SQL query through Spark SQL and wrap the result as a
     L{DataFrame}. This is not intended for querying arbitrary
     databases, but rather querying Spark SQL tables.
     Parameters
     ----------
     query: string
         The SQL query to pass to Spark SQL to execute.
     Returns
     -------
     Sparkling Pandas DataFrame.
     """
     results = self.sql_ctx.sql(query)
     wrapped = DataFrame.from_spark_rdd(results, self.sql_ctx)
     return wrapped
Beispiel #17
0
    def mean(self):
        """Compute mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        # Fast path: delegate to Spark SQL's built-in aggregation.
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.mean)
        # Slow path: regroup with pandas and average each group locally.
        self._prep_pandas_groupby()
        means = self._regroup_mergedRDD().values().map(
            lambda frame: frame.mean())
        return DataFrame.fromDataFrameRDD(means, self.sql_ctx)
Beispiel #18
0
 def aggregate(self, f):
     """Apply the aggregation function.
     Note: This implementation does not take advantage of partial
     aggregation unless we have one of the special cases.
     Currently the only special case is Series.kurtosis - and even
     that doesn't properly do partial aggregations, but we can improve
     it to do this eventually!
     """
     use_spark_kurtosis = (self._can_use_new_school() and
                           f == pd.Series.kurtosis)
     if use_spark_kurtosis:
         self._prep_spark_sql_groupby()
         import custom_functions as CF
         return self._use_aggregation(CF.kurtosis)
     self._prep_pandas_groupby()
     aggregated = self._regroup_mergedRDD().values().map(
         lambda g: g.aggregate(f))
     return DataFrame.fromDataFrameRDD(aggregated, self.sql_ctx)
Beispiel #19
0
    def get_result(self):
        """Perform the configured merge/join and return the resulting
        Sparkling Pandas DataFrame."""
        def list_head(some_list):
            # Returns None for an empty/None list.
            if some_list:
                return some_list[0]

        self._validate_specification()
        left_rdd_with_suffixes, \
        right_rdd_with_suffixes = self._prep_for_merge()

        def create_condition(left_rdd, right_rdd, left_on, right_on):
            # Equality condition between one left and one right column.
            return getattr(left_rdd, left_on) == \
                   getattr(right_rdd, right_on)

        def join_condition(left_rdd, right_rdd, left_on, right_on):
            condition = create_condition(left_rdd, right_rdd, left_on[0],
                                         right_on[0])
            # BUG FIX: the original iterated `enumerate(zip(...))`, which
            # binds (index, (a, b)) so `a` was an int index and `b` a tuple,
            # and combined Column conditions with Python's `and`, which just
            # evaluates truthiness of the first Column. Spark SQL Columns
            # must be conjoined with `&`.
            for a, b in zip(left_on[1:], right_on[1:]):
                condition = condition & create_condition(
                    left_rdd, right_rdd, a, b)
            return condition

        if self.on is not None:
            joined = left_rdd_with_suffixes.join(right_rdd_with_suffixes,
                                                 list_head(self.on), self.how)
        else:
            joined = left_rdd_with_suffixes. \
                join(right_rdd_with_suffixes,
                     join_condition(left_rdd_with_suffixes,
                                    right_rdd_with_suffixes, self.left_on,
                                    self.right_on), self.how)
        if self.sort:
            # according to spark documentation, we can only sort
            # by one column
            if self.on:
                joined = joined.sort(list_head(self.on))
            else:
                # BUG FIX: the original called list_head(self.left_on[0]),
                # i.e. the first *character* of the first column name;
                # sort by the first join column instead.
                joined = joined.sort(list_head(self.left_on))
        return DataFrame.from_schema_rdd(joined)
Beispiel #20
0
    def sum(self):
        """Compute the sum for each group."""
        # Fast path: Spark SQL's built-in sum aggregation.
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.sum)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            # Sum within each group of a single partition's frame.
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            # NOTE(review): x is an already-summed frame while
            # create_combiner(y) is another; concatenating them can leave
            # duplicate group-index rows that are never re-summed here.
            # Presumably merge_combiner's `x + y` is relied on to align and
            # add — verify behavior when a group appears in both frames.
            return pd.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            # Index-aligned addition of two per-group sum frames.
            return x + y

        rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx)
Beispiel #21
0
    def count(self):
        """Compute the number of elements in each group."""
        # Fast path: Spark SQL's built-in count aggregation.
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.count)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            # Per-group non-NA counts for a single partition's frame.
            return x.groupby(*myargs, **mykwargs).count()

        def merge_value(x, y):
            # NOTE(review): `.count()` here has no `level` argument, so it
            # counts over the whole appended frame rather than per group —
            # contrast merge_combiner's `count(level=0)` below. Confirm
            # this asymmetry is intentional.
            return x.append(create_combiner(y)).count()

        def merge_combiner(x, y):
            # Re-count per group (level 0 of the index) after appending.
            return x.append(y).count(level=0)

        rddOfCounts = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfCounts, self.sql_ctx)
Beispiel #22
0
    def last(self):
        """Pull out the last from each group."""
        groupby_args = self._myargs
        groupby_kwargs = self._mykwargs
        # Prefer Spark SQL grouping when it is possible to use it.
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.last)

        def make_combiner(frame):
            # Last row of each group within one partition's frame.
            return frame.groupby(*groupby_args, **groupby_kwargs).last()

        def add_frame(combined, frame):
            # A newer frame supersedes whatever came before it.
            return make_combiner(frame)

        def merge_combined(left, right):
            # Later combiner wins.
            return right

        lasts = self._distributedRDD.combineByKey(
            make_combiner, add_frame, merge_combined)
        sorted_lasts = self._sortIfNeeded(lasts).values()
        return DataFrame.fromDataFrameRDD(sorted_lasts, self.sql_ctx)