Example #1
0
    def median(self):
        """Return the per-group median, ignoring missing values.

        When grouping on several keys the result index is a MultiIndex.
        """
        self._prep_old_school()
        # Reduce every value of the regrouped RDD with its own ``median()``.
        grouped_values = self._regroup_mergedRDD().values()
        medians = grouped_values.map(lambda grp: grp.median())
        return Dataframe.fromDataFrameRDD(medians, self.sql_ctx)
Example #2
0
 def from_spark_rdd(self, spark_rdd, sql_ctx):
     """
     Wrap a Spark DataFrame RDD in a SparklingPandas dataframe.
     :param spark_rdd: Input dataframe RDD to convert
     :param sql_ctx: SQL context passed through to Dataframe.from_spark_rdd
     :return: Matching SparklingPandas dataframe
     """
     sparkling_df = Dataframe.from_spark_rdd(spark_rdd, sql_ctx)
     return sparkling_df
Example #3
0
    def first(self):
        """
        Pull out the first from each group. Note: this is different than
        Spark's first.

        A Spark SQL aggregation is used when ``_can_use_new_school()`` allows
        it; otherwise the per-key values of ``self._distributedRDD`` are
        combined with ``combineByKey``.
        """
        # If it's possible to use Spark SQL grouping do it
        if self._can_use_new_school():
            self._prep_new_school()
            import pyspark.sql.functions as func

            return self._use_aggregation(func.first)
        myargs = self._myargs
        mykwargs = self._mykwargs
        self._prep_old_school()

        def create_combiner(x):
            # Turn the first raw value seen for a key into its "first" result.
            return x.groupby(*myargs, **mykwargs).first()

        def merge_value(x, y):
            # ``x`` is already the combined "first" result for this key, so it
            # is returned as-is; re-grouping it would be redundant work on an
            # already aggregated frame, and later values ``y`` never matter.
            return x

        def merge_combiner(x, y):
            # Keep the earlier of two partial results.
            return x

        rddOfFirst = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value, merge_combiner)
        ).values()
        return Dataframe.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
Example #4
0
 def nth(self, n, *args, **kwargs):
     """Take the nth element of each groupby.

     ``n`` and any extra positional/keyword arguments are forwarded to the
     ``nth`` method of every value in the regrouped RDD.
     """
     # TODO: Stop collecting the entire frame for each key.
     self._prep_old_school()
     nthRDD = self._regroup_mergedRDD().mapValues(
         lambda r: r.nth(n, *args, **kwargs)).values()
     return Dataframe.fromDataFrameRDD(nthRDD, self.sql_ctx)
Example #5
0
    def var(self, ddof=1):
        """Compute variance of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        ``ddof`` is forwarded to the ``var`` call made on each group.
        """
        self._prep_old_school()
        grouped_values = self._regroup_mergedRDD().values()
        variances = grouped_values.map(lambda grp: grp.var(ddof=ddof))
        return Dataframe.fromDataFrameRDD(variances, self.sql_ctx)
Example #6
0
    def _use_aggregation(self, agg, columns=None):
        """Compute the result using the aggregation function provided.

        Every selected column is aggregated with ``agg`` and aliased back to
        its own name, which strips the extra name that Spark SQL adds.

        :param agg: callable taking a column name and returning an object
            with an ``alias`` method (e.g. a ``pyspark.sql.functions``
            aggregate).
        :param columns: column names to aggregate; ``self._columns`` is used
            when this is omitted or empty.
        :return: the result of ``Dataframe.from_schema_rdd`` applied to the
            aggregated data, indexed by ``self._by``.
        """
        if not columns:
            columns = self._columns
        # Aggregate the resolved ``columns`` (not unconditionally
        # ``self._columns``) so a caller-supplied subset is honoured.
        aggs = [agg(column).alias(column) for column in columns]
        aggRdd = self._grouped_spark_sql.agg(*aggs)
        return Dataframe.from_schema_rdd(aggRdd, self._by)
Example #7
0
    def mean(self):
        """Return the per-group mean, ignoring missing values.

        When grouping on several keys the result index is a MultiIndex.
        """
        if not self._can_use_new_school():
            # Fallback: reduce every regrouped value with its own ``mean()``.
            self._prep_old_school()
            grouped_values = self._regroup_mergedRDD().values()
            means = grouped_values.map(lambda grp: grp.mean())
            return Dataframe.fromDataFrameRDD(means, self.sql_ctx)

        self._prep_new_school()
        import pyspark.sql.functions as sql_functions

        return self._use_aggregation(sql_functions.mean)
Example #8
0
    def aggregate(self, f):
        """Run the aggregation function ``f`` over the groups.
        Note: This implementation does not take advantage of partial
        aggregation unless we hit one of the special cases.
        Currently the only special case is Series.kurtosis - and even
        that doesn't properly do partial aggregations, but it can be
        improved to do so eventually!
        """
        special_case = self._can_use_new_school() and f == pd.Series.kurtosis
        if not special_case:
            # General path: call ``aggregate(f)`` on every regrouped value.
            self._prep_old_school()
            aggregated = self._regroup_mergedRDD().values().map(
                lambda grp: grp.aggregate(f)
            )
            return Dataframe.fromDataFrameRDD(aggregated, self.sql_ctx)

        self._prep_new_school()
        import custom_functions as CF

        return self._use_aggregation(CF.kurtosis)
Example #9
0
 def from_pd_data_frame(self, local_df):
     """Build a distributed dataframe out of a local dataframe. The intended
     use is for testing. Note: dtypes are re-inferred, so they may not match."""
     column_names = list(local_df.columns)
     index_names = _normalize_index_names(list(local_df.index.names))
     # Index names come first, followed by the regular columns.
     schema = index_names + column_names
     # TODO: Convert to row objects directly?
     records = [record.tolist() for record in local_df.to_records()]
     rows = self.spark_ctx.parallelize(records)
     # Look at all the rows when sampling; this should be ok since the data
     # comes from a local dataset.
     spark_df = self.sql_ctx.createDataFrame(
         rows, schema=schema, samplingRatio=1)
     sp_df = Dataframe.from_schema_rdd(spark_df)
     sp_df._index_names = index_names
     return sp_df
Example #10
0
    def apply(self, func, *args, **kwargs):
        """Run ``func`` over the groups and stitch the results back together
        the same way apply from groupby in pandas does.

        This returns a Dataframe.
        """
        self._prep_old_school()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def regroup(data):
            """Group one distributed value with the stored groupby arguments."""
            return data.groupby(*myargs, **mykwargs)

        def run_func(key_data):
            """Apply ``func`` to the grouped value of a (key, value) pair."""
            return key_data[1].apply(func, *args, **kwargs)

        def key_by_index(data):
            """Emit every row as an (index, single-row frame) pair."""
            # TODO: Is there a better way to do this?
            for key, row in data.iterrows():
                single_row = pd.DataFrame.from_dict({key: row}, orient="index")
                yield (key, single_row)

        applied = self._distributedRDD.mapValues(regroup).map(run_func)
        rekeyed = applied.flatMap(key_by_index)
        frames = self._sortIfNeeded(rekeyed).values()
        return Dataframe.fromDataFrameRDD(frames, self.sql_ctx)
Example #11
0
    def last(self):
        """Pull out the last from each group.

        A Spark SQL aggregation is used when ``_can_use_new_school()`` allows
        it; otherwise the per-key values of ``self._distributedRDD`` are
        combined with ``combineByKey``.
        """
        myargs = self._myargs
        mykwargs = self._mykwargs
        # If it's possible to use Spark SQL grouping do it
        if self._can_use_new_school():
            self._prep_new_school()
            import pyspark.sql.functions as func

            return self._use_aggregation(func.last)
        # Prepare the pandas-based state before touching
        # ``self._distributedRDD``; the other pandas-based aggregations do
        # the same before their combineByKey call.
        self._prep_old_school()

        def create_combiner(x):
            # Turn a raw value for a key into its "last" result.
            return x.groupby(*myargs, **mykwargs).last()

        def merge_value(x, y):
            # A newer raw value replaces whatever was combined so far.
            return create_combiner(y)

        def merge_combiner(x, y):
            # Keep the later of two partial results.
            return y

        rddOfLast = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value, merge_combiner)
        ).values()
        return Dataframe.fromDataFrameRDD(rddOfLast, self.sql_ctx)
Example #12
0
    def max(self):
        """Compute the max for each group.

        A Spark SQL aggregation is used when ``_can_use_new_school()`` allows
        it; otherwise the per-key values of ``self._distributedRDD`` are
        combined with ``combineByKey``.
        """
        if self._can_use_new_school():
            self._prep_new_school()
            import pyspark.sql.functions as func

            return self._use_aggregation(func.max)
        self._prep_old_school()
        myargs = self._myargs
        mykwargs = self._mykwargs
        import pandas as pd

        def combine_max(frames):
            # Stack the partial results and keep the maximum per level-0
            # index label, so the combiner stays a frame of the same shape.
            # concat + groupby(level=0) replaces ``append(...).max(level=0)``.
            return pd.concat(frames).groupby(level=0, sort=False).max()

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).max()

        def merge_value(x, y):
            # Reduce exactly like merge_combiner does; a bare ``.max()`` here
            # would collapse the frame into a per-column Series.
            return combine_max([x, create_combiner(y)])

        def merge_combiner(x, y):
            return combine_max([x, y])

        rddOfMax = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value, merge_combiner)
        ).values()
        return Dataframe.fromDataFrameRDD(rddOfMax, self.sql_ctx)
Example #13
0
    def sum(self):
        """Compute the sum for each group.

        A Spark SQL aggregation is used when ``_can_use_new_school()`` allows
        it; otherwise the per-key values of ``self._distributedRDD`` are
        combined with ``combineByKey``.
        """
        if self._can_use_new_school():
            self._prep_new_school()
            import pyspark.sql.functions as func

            return self._use_aggregation(func.sum)
        self._prep_old_school()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            # Add the new partial sum to the accumulator, the same reduction
            # merge_combiner applies. Concatenating instead would leave one
            # row per merged value and never actually sum them.
            # NOTE(review): ``+`` also adds any non-index columns, e.g. the
            # key columns if as_index=False is in use -- confirm with callers.
            return x + create_combiner(y)

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value, merge_combiner)
        ).values()
        return Dataframe.fromDataFrameRDD(rddOfSum, self.sql_ctx)
Example #14
0
 def _from_pandas_rdd_records(self, pandas_rdd_records, schema):
     """Create a L{Dataframe} from an RDD of records with schema.

     :param pandas_rdd_records: RDD of records handed to
         ``self.sql_ctx.createDataFrame``.
     :param schema: object whose ``values.tolist()`` yields the schema passed
         to ``createDataFrame``.
     :return: the L{Dataframe} built by ``Dataframe.from_spark_rdd``.
     """
     # ``self`` is required: the body reads ``self.sql_ctx``, which was
     # unresolvable when the method was declared without it.
     spark_df = self.sql_ctx.createDataFrame(pandas_rdd_records,
                                             schema.values.tolist())
     return Dataframe.from_spark_rdd(spark_df, self.sql_ctx)
Example #15
0
 def table(self, table):
     """Look up the provided table and return it as a L{Dataframe}"""
     sql_ctx = self.sql_ctx
     return Dataframe.from_spark_rdd(sql_ctx.table(table), sql_ctx)
Example #16
0
 def sql(self, query):
     """Run a SQL query and wrap its result in a L{Dataframe}."""
     sql_ctx = self.sql_ctx
     return Dataframe.from_spark_rdd(sql_ctx.sql(query), sql_ctx)