def from_pd_data_frame(self, local_df):
    """Make a Sparkling Pandas dataframe from a local Pandas DataFrame.

    The intended use is for testing or joining distributed data with
    local data. The dtypes are re-inferred, so they may not match.

    Parameters
    ----------
    local_df: Pandas DataFrame
        The data to turn into a distributed Sparkling Pandas DataFrame.
        See http://bit.ly/pandasDataFrame for docs.

    Returns
    -------
    A Sparkling Pandas DataFrame.
    """
    def _records_as_rows(frame):
        """Flatten a Pandas DataFrame into a list of plain row lists."""
        # TODO: Convert to row objects directly?
        return [record.tolist() for record in frame.to_records()]

    # Index columns come first in the Spark schema, followed by the
    # regular data columns.
    index_names = _normalize_index_names(list(local_df.index.names))
    schema = index_names + list(local_df.columns)
    row_rdd = self.spark_ctx.parallelize(_records_as_rows(local_df))
    # samplingRatio=1: look at every row when inferring types, which is
    # fine because the data originated from a local dataset.
    spark_frame = self.sql_ctx.createDataFrame(row_rdd, schema=schema,
                                               samplingRatio=1)
    result = DataFrame.from_schema_rdd(spark_frame)
    result._index_names = index_names
    return result
def from_pd_data_frame(self, local_df):
    """Make a distributed dataframe from a local Pandas DataFrame.

    The intended use is for testing. Note: dtypes are re-inferred, so
    they may not match.

    Parameters
    ----------
    local_df: Pandas DataFrame
        The local data to turn into a distributed Sparkling Pandas
        DataFrame.

    Returns
    -------
    A Sparkling Pandas DataFrame.
    """
    def frame_to_rows(frame):
        """Convert a Pandas DataFrame into a list of Spark SQL rows."""
        # TODO: Convert to row objects directly?
        return [r.tolist() for r in frame.to_records()]

    schema = list(local_df.columns)
    index_names = _normalize_index_names(list(local_df.index.names))
    # Index columns precede the data columns in the Spark schema.
    schema = index_names + schema
    rows = self.spark_ctx.parallelize(frame_to_rows(local_df))
    # BUG FIX: was `Dataframe.from_schema_rdd` (lowercase "f"), which is
    # a NameError at runtime; the class is spelled `DataFrame`.
    sp_df = DataFrame.from_schema_rdd(
        self.sql_ctx.createDataFrame(
            rows,
            schema=schema,
            # Look at all the rows, should be ok since coming from
            # a local dataset
            samplingRatio=1))
    sp_df._index_names = index_names
    return sp_df