Code example #1
File: pcontext.py  Project: jhlch/sparklingpandas
 def from_pd_data_frame(self, local_df):
     """Make a Sparkling Pandas dataframe from a local Pandas DataFrame.
     The intend use is for testing or joining distributed data with local
     data.
     The types are re-infered, so they may not match.
     Parameters
     ----------
     local_df: Pandas DataFrame
         The data to turn into a distributed Sparkling Pandas DataFrame.
         See http://bit.ly/pandasDataFrame for docs.
     Returns
     -------
     A Sparkling Pandas DataFrame.
     """
     def frame_to_rows(frame):
         """Convert a Pandas DataFrame into a list of Spark SQL Rows"""
         # TODO: Convert to row objects directly?
         return [r.tolist() for r in frame.to_records()]
     schema = list(local_df.columns)
     index_names = list(local_df.index.names)
     index_names = _normalize_index_names(index_names)
     schema = index_names + schema
     rows = self.spark_ctx.parallelize(frame_to_rows(local_df))
     sp_df = DataFrame.from_schema_rdd(
         self.sql_ctx.createDataFrame(
             rows,
             schema=schema,
             # Look at all the rows, should be ok since coming from
             # a local dataset
             samplingRatio=1))
     sp_df._index_names = index_names
     return sp_df
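
A minimal usage sketch (not part of the project source): it assumes `pctx` is an already-constructed Sparkling Pandas context, i.e. an instance of the class in pcontext.py that defines from_pd_data_frame; its construction is not shown here.

import pandas as pd

# `pctx` is assumed to be a Sparkling Pandas context object exposing
# from_pd_data_frame; how it is created is outside this snippet.
local_df = pd.DataFrame({"name": ["a", "b", "c"], "value": [1, 2, 3]})
sp_df = pctx.from_pd_data_frame(local_df)
# sp_df is a distributed Sparkling Pandas DataFrame; the local index names
# were folded into the Spark SQL schema by the method above.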
Code example #2
File: groupby.py  Project: yupbank/sparklingpandas
 def _use_aggregation(self, agg, columns=None):
     """Compute the result using the aggregation function provided.
     The aggregation name must also be provided so we can strip off the
     extra name that Spark SQL adds."""
     if not columns:
         columns = self._columns
     from pyspark.sql import functions as F
     # Alias each aggregated column back to its original name so the extra
     # name Spark SQL adds (e.g. "AVG(col)") is stripped off. Aggregate the
     # requested columns, not unconditionally all of self._columns.
     aggs = [agg(column).alias(column) for column in columns]
     agg_rdd = self._grouped_spark_sql.agg(*aggs)
     return DataFrame.from_schema_rdd(agg_rdd, self._by)
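
As a hedged sketch of how callers might use this helper, a groupby wrapper could expose named aggregations that just pass the matching pyspark.sql.functions callable. The class and method names below are illustrative assumptions, not confirmed project API; only F.sum and F.avg are real PySpark functions.

from pyspark.sql import functions as F

class GroupBy(object):  # stand-in name for the project's groupby wrapper
    def sum(self, columns=None):
        """Per-group sum of the requested columns (illustrative wrapper)."""
        return self._use_aggregation(F.sum, columns)

    def mean(self, columns=None):
        """Per-group mean of the requested columns (illustrative wrapper)."""
        return self._use_aggregation(F.avg, columns)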
Code example #3
    def get_result(self):
        def list_head(some_list):
            """Return the first element of a list, or None if it is empty."""
            if some_list:
                return some_list[0]

        self._validate_specification()
        (left_rdd_with_suffixes,
         right_rdd_with_suffixes) = self._prep_for_merge()

        def create_condition(left_rdd, right_rdd, left_on, right_on):
            """Equality condition between one join column from each side."""
            return (getattr(left_rdd, left_on) ==
                    getattr(right_rdd, right_on))

        def join_condition(left_rdd, right_rdd, left_on, right_on):
            """AND together the equality conditions for every pair of join
            columns."""
            condition = create_condition(left_rdd, right_rdd, left_on[0],
                                         right_on[0])
            for (a, b) in zip(left_on[1:], right_on[1:]):
                # Spark SQL Column objects must be combined with &, not the
                # Python `and` operator.
                condition = condition & create_condition(
                    left_rdd, right_rdd, a, b)
            return condition

        if self.on is not None:
            joined = left_rdd_with_suffixes.join(right_rdd_with_suffixes,
                                                 list_head(self.on), self.how)
        else:
            joined = left_rdd_with_suffixes.join(
                right_rdd_with_suffixes,
                join_condition(left_rdd_with_suffixes,
                               right_rdd_with_suffixes, self.left_on,
                               self.right_on),
                self.how)
        if self.sort:
            # According to the Spark documentation, we can only sort
            # by one column.
            if self.on:
                joined = joined.sort(list_head(self.on))
            else:
                joined = joined.sort(list_head(self.left_on))
        return DataFrame.from_schema_rdd(joined)
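
For comparison, here is a minimal standalone PySpark sketch of the kind of expression join_condition builds for a single pair of join columns. It uses the current SparkSession API rather than the SQLContext-era API of the snippet above, and the table and column names are made up for illustration.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, "a"), (2, "b")], ["user_id", "name"])
right = spark.createDataFrame([(1, 10.0), (2, 20.0)], ["customer_id", "total"])

# Single-column equivalent of join_condition(left, right,
# ["user_id"], ["customer_id"]); additional pairs would be chained with &.
condition = left.user_id == right.customer_id
joined = left.join(right, condition, "inner")
joined.show()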