def __init__(self, exprs, percentiles_relative_error=1 / 10000):
    self.percentiles_relative_error = percentiles_relative_error
    self.column_stat_helpers = {}
    self.cols = [parse(e) for e in exprs] if exprs else [parse("*")]
    # As Python < 3.6 does not guarantee dict ordering,
    # we need to keep track of the order in which the columns were added
    self.col_names = []

def merge(self, row, schema):
    for col in self.cols:
        for field in col.output_fields(schema):
            col_name = field.name
            if col_name not in self.column_stat_helpers:
                self.column_stat_helpers[col_name] = ColumnStatHelper(
                    parse(col_name), self.percentiles_relative_error
                )
                self.col_names.append(col_name)
            self.column_stat_helpers[col_name].merge(row, schema)
    return self
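
# A rough usage sketch for the stat helper above. The enclosing class name is not
# shown in this excerpt, so `RowStatHelper` is a hypothetical stand-in; the intent
# is that rows are merged one by one and one ColumnStatHelper accumulates per
# output column, in first-seen order:
#
#     helper = RowStatHelper(["age", "name"])
#     for row in rows:
#         helper.merge(row, schema)
#     ordered_stats = [helper.column_stat_helpers[name] for name in helper.col_names]
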
def pivot(self, pivot_col, values=None):
    """
    >>> from pysparkling import Context, Row
    >>> from pysparkling.sql.session import SparkSession
    >>> from pysparkling.sql.functions import col, avg, sum
    >>> sc = Context()
    >>> spark = SparkSession(sc)
    >>> df4 = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000),
    ...                       Row(course="Java", year=2012, earnings=20000),
    ...                       Row(course="dotNET", year=2012, earnings=5000),
    ...                       Row(course="dotNET", year=2013, earnings=48000),
    ...                       Row(course="Java", year=2013, earnings=30000)]).toDF()
    >>> df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect()
    [Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)]
    >>> df4.groupBy("year").pivot("course").sum("earnings").collect()
    [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)]
    >>> df4.groupBy("year").pivot("course", ["dotNET"]).sum("earnings").collect()
    [Row(year=2012, dotNET=15000), Row(year=2013, dotNET=48000)]
    >>> df4.groupBy("year").pivot("course").agg(sum("earnings")).show()
    +----+-----+------+
    |year| Java|dotNET|
    +----+-----+------+
    |2012|20000| 15000|
    |2013|30000| 48000|
    +----+-----+------+
    >>> df4.groupBy("year").pivot("course", ["dotNET", "PHP"]).agg(sum("earnings")).show()
    +----+------+----+
    |year|dotNET| PHP|
    +----+------+----+
    |2012| 15000|null|
    |2013| 48000|null|
    +----+------+----+
    >>> df4.groupBy("year").pivot("course").agg(sum("earnings"), avg("earnings")).show()
    +----+------------------+------------------+--------------------+--------------------+
    |year|Java_sum(earnings)|Java_avg(earnings)|dotNET_sum(earnings)|dotNET_avg(earnings)|
    +----+------------------+------------------+--------------------+--------------------+
    |2012|             20000|           20000.0|               15000|              7500.0|
    |2013|             30000|           30000.0|               48000|             48000.0|
    +----+------------------+------------------+--------------------+--------------------+
    """
    jgd = self._jgd.pivot(parse(pivot_col), values)
    return GroupedData(jgd, self._df)

def agg(self, *exprs):
    """
    # >>> sorted(gdf.agg({"*": "count"}).collect())
    # [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]

    >>> from pysparkling import Context, Row
    >>> from pysparkling.sql.session import SparkSession
    >>> from pysparkling.sql.functions import col, avg
    >>> spark = SparkSession(Context())
    >>> df = spark.createDataFrame(
    ...     [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
    ... )
    >>> gdf = df.groupBy(df.name)
    >>> from pysparkling.sql import functions as F
    >>> sorted(gdf.agg(F.min(df.age)).collect())
    [Row(name='Alice', min(age)=2), Row(name='Bob', min(age)=5)]
    >>> df.groupBy("age").agg(avg("age"), col("age")).show()
    +---+--------+---+
    |age|avg(age)|age|
    +---+--------+---+
    |  2|     2.0|  2|
    |  5|     5.0|  5|
    +---+--------+---+
    """
    if not exprs:
        raise ValueError("exprs should not be empty")

    if len(exprs) == 1 and isinstance(exprs[0], dict):
        # pylint: disable=W0511
        # todo: implement agg_dict
        jdf = self._jgd.agg_dict(exprs[0])
    else:
        # Columns
        if not all(isinstance(c, Column) for c in exprs):
            raise ValueError("all exprs should be Column")
        # noinspection PyProtectedMember
        jdf = self._jgd.agg([parse(e) for e in exprs])
    return DataFrame(jdf, self.sql_ctx)

def sum(self, *cols):
    return self.agg(*(sum(parse(col)) for col in cols))

def min(self, *cols):
    return self.agg(*(min(parse(col)) for col in cols))

def avg(self, *cols):
    return self.agg(*(avg(parse(col)) for col in cols))
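
# A minimal usage sketch for the shorthand aggregations above, reusing the df4
# doctest data from pivot(); column naming follows the agg() doctests, and the
# Row ordering shown is an assumption (it depends on grouping order):
#
#     df4.groupBy("year").sum("earnings").collect()
#     # [Row(year=2012, sum(earnings)=35000), Row(year=2013, sum(earnings)=78000)]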