Example #1 (votes: 0)
 def from_pd_data_frame(self, local_df):
     """Make a Sparkling Pandas dataframe from a local Pandas DataFrame.
     The intended use is for testing or joining distributed data with local
     data.
     The types are re-inferred (via Spark SQL schema inference), so they may
     not match the original Pandas dtypes.
     Parameters
     ----------
     local_df: Pandas DataFrame
         The data to turn into a distributed Sparkling Pandas DataFrame.
         See http://bit.ly/pandasDataFrame for docs.
     Returns
     -------
     A Sparkling Pandas DataFrame.
     """
     def frame_to_rows(frame):
         """Convert a Pandas DataFrame into a list of Spark SQL Rows"""
         # TODO: Convert to row objects directly?
         # to_records() includes the index columns, so each row list carries
         # index values followed by data values — matching `schema` below.
         return [r.tolist() for r in frame.to_records()]
     schema = list(local_df.columns)
     index_names = list(local_df.index.names)
     # Replace None/unusable index names with usable column names so the
     # index can live as ordinary columns in the Spark schema.
     index_names = _normalize_index_names(index_names)
     schema = index_names + schema
     rows = self.spark_ctx.parallelize(frame_to_rows(local_df))
     sp_df = DataFrame.from_schema_rdd(
         self.sql_ctx.createDataFrame(
             rows,
             schema=schema,
             # Look at all the rows, should be ok since coming from
             # a local dataset
             samplingRatio=1))
     # Remember which leading columns are really the index.
     sp_df._index_names = index_names
     return sp_df
Example #2 (votes: 0)
 def from_pd_data_frame(self, local_df):
     """Build a distributed Sparkling Pandas DataFrame from a local Pandas
     DataFrame.
     The intended use is for testing or joining distributed data with local
     data.
     The types are re-inferred, so they may not match.
     Parameters
     ----------
     local_df: Pandas DataFrame
         The data to turn into a distributed Sparkling Pandas DataFrame.
         See http://bit.ly/pandasDataFrame for docs.
     Returns
     -------
     A Sparkling Pandas DataFrame.
     """
     def as_row_lists(pdf):
         """Turn a Pandas DataFrame into a list of plain row lists."""
         # TODO: Convert to row objects directly?
         return [record.tolist() for record in pdf.to_records()]
     # Index columns come first in the schema, since to_records() emits
     # index values before the data columns.
     index_names = _normalize_index_names(list(local_df.index.names))
     full_schema = index_names + list(local_df.columns)
     row_rdd = self.spark_ctx.parallelize(as_row_lists(local_df))
     spark_df = self.sql_ctx.createDataFrame(
         row_rdd,
         schema=full_schema,
         # Look at all the rows, should be ok since coming from
         # a local dataset
         samplingRatio=1)
     result = DataFrame.from_schema_rdd(spark_df)
     result._index_names = index_names
     return result
Example #3 (votes: 0)
 def from_pd_data_frame(self, local_df):
     """Make a distributed Sparkling Pandas DataFrame from a local Pandas
     DataFrame. The intended use is for testing.
     Note: dtypes are re-inferred, so they may not match.
     Parameters
     ----------
     local_df: Pandas DataFrame
         The data to turn into a distributed Sparkling Pandas DataFrame.
     Returns
     -------
     A Sparkling Pandas DataFrame.
     """
     def frame_to_rows(frame):
         """Convert a Pandas DataFrame into a list of Spark SQL Rows"""
         # TODO: Convert to row objects directly?
         # to_records() emits index values before data values, matching
         # the index-first `schema` built below.
         return [r.tolist() for r in frame.to_records()]
     schema = list(local_df.columns)
     index_names = list(local_df.index.names)
     index_names = _normalize_index_names(index_names)
     schema = index_names + schema
     rows = self.spark_ctx.parallelize(frame_to_rows(local_df))
     # Fixed: `Dataframe` was a typo — the wrapper class is `DataFrame`
     # (as used by the sibling implementations in this file).
     sp_df = DataFrame.from_schema_rdd(
         self.sql_ctx.createDataFrame(
             rows,
             schema=schema,
             # Look at all the rows, should be ok since coming from
             # a local dataset
             samplingRatio=1))
     sp_df._index_names = index_names
     return sp_df