Example #1
 def __init__(self, exprs, percentiles_relative_error=1 / 10000):
     self.percentiles_relative_error = percentiles_relative_error
     self.column_stat_helpers = {}
     self.cols = [parse(e) for e in exprs] if exprs else [parse("*")]
     # As Python < 3.6 does not guarantee dict ordering,
     # we need to keep track of the order in which the columns were added
     self.col_names = []
Example #2
    def merge(self, row, schema):
        for col in self.cols:
            for field in col.output_fields(schema):
                col_name = field.name
                if col_name not in self.column_stat_helpers:
                    self.column_stat_helpers[col_name] = ColumnStatHelper(
                        parse(col_name), self.percentiles_relative_error)
                    self.col_names.append(col_name)
                self.column_stat_helpers[col_name].merge(row, schema)

        return self
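Examples #1 and #2 together form a per-column statistics accumulator: a stat helper is created lazily the first time a column name is seen (with the insertion order recorded explicitly for Python < 3.6), and every incoming row is merged into the matching helper. Below is a minimal, self-contained sketch of that pattern; MinMaxStatHelper is a hypothetical stand-in for pysparkling's ColumnStatHelper, and plain dicts stand in for Row/schema objects.

# Sketch of the lazy-helper accumulator pattern used above.
# MinMaxStatHelper is a hypothetical stand-in for ColumnStatHelper;
# rows are plain dicts instead of pysparkling Row/schema objects.
class MinMaxStatHelper:
    def __init__(self):
        self.count = 0
        self.min = None
        self.max = None

    def merge(self, value):
        self.count += 1
        self.min = value if self.min is None else min(self.min, value)
        self.max = value if self.max is None else max(self.max, value)


class RowStats:
    def __init__(self):
        self.column_stat_helpers = {}
        self.col_names = []  # keeps column insertion order explicitly

    def merge(self, row):
        for col_name, value in row.items():
            if col_name not in self.column_stat_helpers:
                # create the helper lazily, the first time the column is seen
                self.column_stat_helpers[col_name] = MinMaxStatHelper()
                self.col_names.append(col_name)
            self.column_stat_helpers[col_name].merge(value)
        return self


stats = RowStats()
for row in [{"age": 2}, {"age": 5}]:
    stats.merge(row)
print(stats.column_stat_helpers["age"].min,   # 2
      stats.column_stat_helpers["age"].max)   # 5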
Example #3
 def pivot(self, pivot_col, values=None):
     """
     >>> from pysparkling import Context, Row
     >>> from pysparkling.sql.session import SparkSession
     >>> from pysparkling.sql.functions import col, avg, sum
     >>> sc = Context()
     >>> spark = SparkSession(sc)
     >>> df4 = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000),
     ...                        Row(course="Java",   year=2012, earnings=20000),
     ...                        Row(course="dotNET", year=2012, earnings=5000),
     ...                        Row(course="dotNET", year=2013, earnings=48000),
     ...                        Row(course="Java",   year=2013, earnings=30000)]).toDF()
     >>> df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect()
     [Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)]
     >>> df4.groupBy("year").pivot("course").sum("earnings").collect()
     [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)]
     >>> df4.groupBy("year").pivot("course", ["dotNET"]).sum("earnings").collect()
     [Row(year=2012, dotNET=15000), Row(year=2013, dotNET=48000)]
     >>> df4.groupBy("year").pivot("course").agg(sum("earnings")).show()
     +----+-----+------+
     |year| Java|dotNET|
     +----+-----+------+
     |2012|20000| 15000|
     |2013|30000| 48000|
     +----+-----+------+
     >>> df4.groupBy("year").pivot("course", ["dotNET", "PHP"]).agg(sum("earnings")).show()
     +----+------+----+
     |year|dotNET| PHP|
     +----+------+----+
     |2012| 15000|null|
     |2013| 48000|null|
     +----+------+----+
     >>> df4.groupBy("year").pivot("course").agg(sum("earnings"), avg("earnings")).show()
     +----+------------------+------------------+--------------------+--------------------+
     |year|Java_sum(earnings)|Java_avg(earnings)|dotNET_sum(earnings)|dotNET_avg(earnings)|
     +----+------------------+------------------+--------------------+--------------------+
     |2012|             20000|           20000.0|               15000|              7500.0|
     |2013|             30000|           30000.0|               48000|             48000.0|
     +----+------------------+------------------+--------------------+--------------------+
     """
     jgd = self._jgd.pivot(parse(pivot_col), values)
     return GroupedData(jgd, self._df)
Example #4
    def agg(self, *exprs):
        """
        # >>> sorted(gdf.agg({"*": "count"}).collect())
        # [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]

        >>> from pysparkling import Context, Row
        >>> from pysparkling.sql.session import SparkSession
        >>> from pysparkling.sql.functions import col, avg
        >>> spark = SparkSession(Context())
        >>> df = spark.createDataFrame(
        ...   [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
        ... )
        >>> gdf = df.groupBy(df.name)
        >>> from pysparkling.sql import functions as F
        >>> sorted(gdf.agg(F.min(df.age)).collect())
        [Row(name='Alice', min(age)=2), Row(name='Bob', min(age)=5)]
        >>> df.groupBy("age").agg(avg("age"), col("age")).show()
        +---+--------+---+
        |age|avg(age)|age|
        +---+--------+---+
        |  2|     2.0|  2|
        |  5|     5.0|  5|
        +---+--------+---+

        """
        if not exprs:
            raise ValueError("exprs should not be empty")

        if len(exprs) == 1 and isinstance(exprs[0], dict):
            # pylint: disable=W0511
            # todo implement agg_dict
            jdf = self._jgd.agg_dict(exprs[0])
        else:
            # Columns
            if not all(isinstance(c, Column) for c in exprs):
                raise ValueError("all exprs should be Column")

            # noinspection PyProtectedMember
            jdf = self._jgd.agg([parse(e) for e in exprs])

        return DataFrame(jdf, self.sql_ctx)
Example #5
 def sum(self, *cols):
     return self.agg(*(sum(parse(col)) for col in cols))
Example #6
 def min(self, *cols):
     return self.agg(*(min(parse(col)) for col in cols))
Example #7
 def avg(self, *cols):
     return self.agg(*(avg(parse(col)) for col in cols))