def from_pd_data_frame(self, local_df):
    """Make a Sparkling Pandas DataFrame from a local Pandas DataFrame.
    The intended use is for testing or joining distributed data with
    local data. The types are re-inferred, so they may not match.

    Parameters
    ----------
    local_df: Pandas DataFrame
        The data to turn into a distributed Sparkling Pandas DataFrame.
        See http://bit.ly/pandasDataFrame for docs.

    Returns
    -------
    A Sparkling Pandas DataFrame.
    """
    def frame_to_rows(frame):
        """Convert a Pandas DataFrame into a list of Spark SQL Rows."""
        # TODO: Convert to row objects directly?
        return [r.tolist() for r in frame.to_records()]
    schema = list(local_df.columns)
    index_names = list(local_df.index.names)
    index_names = _normalize_index_names(index_names)
    schema = index_names + schema
    rows = self.spark_ctx.parallelize(frame_to_rows(local_df))
    sp_df = DataFrame.from_schema_rdd(
        self.sql_ctx.createDataFrame(
            rows,
            schema=schema,
            # Look at all of the rows; this should be fine since the
            # data is coming from a local dataset.
            samplingRatio=1))
    sp_df._index_names = index_names
    return sp_df
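
# Usage sketch (not part of the library): a hedged example of calling
# from_pd_data_frame. It assumes `psc` is an already-constructed Sparkling
# Pandas context object exposing the method above; the constructor is not
# shown in this file, so the parameter name is hypothetical.
def _example_from_pd_data_frame(psc):
    import pandas as pd
    # A small local frame; dtypes are re-inferred during conversion and
    # may not match the local ones exactly.
    local = pd.DataFrame({"name": ["a", "b"], "value": [1, 2]})
    # Distribute it, e.g. for tests or for joining with distributed data.
    return psc.from_pd_data_frame(local)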

def _use_aggregation(self, agg, columns=None):
    """Compute the result using the aggregation function provided.
    Each result is aliased back to its column name so we can strip
    off the extra name that Spark SQL adds."""
    if not columns:
        columns = self._columns
    aggs = [agg(column).alias(column) for column in columns]
    agg_rdd = self._grouped_spark_sql.agg(*aggs)
    df = DataFrame.from_schema_rdd(agg_rdd, self._by)
    return df
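
# Usage sketch (not part of the library): a hedged illustration of how a
# named aggregation could delegate to _use_aggregation by passing a
# pyspark.sql.functions callable. Assumes `grouped` is an instance of the
# GroupBy-style class that _use_aggregation is defined on.
def _example_mean(grouped):
    from pyspark.sql import functions as F
    # F.mean is applied per column; aliasing each result back to its
    # column name strips the "avg(col)" style name Spark SQL would add.
    return grouped._use_aggregation(F.mean)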

def get_result(self):
    def list_head(some_list):
        if some_list:
            return some_list[0]

    self._validate_specification()
    left_rdd_with_suffixes, \
        right_rdd_with_suffixes = self._prep_for_merge()

    def create_condition(left_rdd, right_rdd, left_on, right_on):
        return getattr(left_rdd, left_on) == \
            getattr(right_rdd, right_on)

    def join_condition(left_rdd, right_rdd, left_on, right_on):
        condition = create_condition(left_rdd, right_rdd,
                                     left_on[0], right_on[0])
        for (a, b) in zip(left_on[1:], right_on[1:]):
            # Column objects must be combined with `&`; the Python `and`
            # keyword would not build a compound SQL condition.
            condition = condition & create_condition(
                left_rdd, right_rdd, a, b)
        return condition

    if self.on is not None:
        joined = left_rdd_with_suffixes.join(right_rdd_with_suffixes,
                                             list_head(self.on),
                                             self.how)
    else:
        joined = left_rdd_with_suffixes. \
            join(right_rdd_with_suffixes,
                 join_condition(left_rdd_with_suffixes,
                                right_rdd_with_suffixes,
                                self.left_on,
                                self.right_on),
                 self.how)
    if self.sort:
        # According to the Spark documentation, we can only sort
        # by one column.
        if self.on:
            joined = joined.sort(list_head(self.on))
        else:
            joined = joined.sort(list_head(self.left_on))
    return DataFrame.from_schema_rdd(joined)
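
# Standalone sketch (not part of the library) of how the multi-key join
# condition above composes. PySpark Column objects must be combined with
# the `&` operator; the Python `and` keyword would truth-test a Column
# and raise an error. `left` and `right` are assumed to be plain
# pyspark.sql.DataFrame objects.
def _example_join_condition(left, right, left_on, right_on):
    condition = left[left_on[0]] == right[right_on[0]]
    for (a, b) in zip(left_on[1:], right_on[1:]):
        # Each additional key pair adds another equality to the AND chain.
        condition = condition & (left[a] == right[b])
    return left.join(right, condition, "inner")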