Example #1
0
    def test_window_functions(self):
        """Exercise max/min/count and the ranking window functions over a
        frame partitioned by ``value`` and ordered by ``key``.

        Rows are collected, sorted, and compared tuple-by-tuple against the
        expected values (truncated to the width of each returned row).
        """
        rows = [(1, "1"), (2, "2"), (1, "2"), (1, "2")]
        df = self.sqlCtx.createDataFrame(rows, ["key", "value"])
        w = Window.partitionBy("value").orderBy("key")
        from pyspark.sql import functions as F

        # Reuse the two frame specifications instead of rebuilding them
        # per-column; the builder calls are pure so the spec is identical.
        bounded = w.rowsBetween(0, 1)
        unbounded = w.rowsBetween(float("-inf"), float("inf"))
        projected = df.select(
            df.value,
            df.key,
            F.max("key").over(bounded),
            F.min("key").over(bounded),
            F.count("key").over(unbounded),
            F.rowNumber().over(w),
            F.rank().over(w),
            F.denseRank().over(w),
            F.ntile(2).over(w),
        )
        actual = sorted(projected.collect())
        expected = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for got, want in zip(actual, expected):
            self.assertEqual(tuple(got), want[: len(got)])
Example #2
0
 def test_window_functions(self):
     """Verify window aggregates (max/min/count) and ranking functions
     (rowNumber/rank/denseRank/ntile) computed over a window partitioned
     by ``value`` and ordered by ``key``."""
     data = [(1, "1"), (2, "2"), (1, "2"), (1, "2")]
     frame = self.sqlCtx.createDataFrame(data, ["key", "value"])
     win = Window.partitionBy("value").orderBy("key")
     from pyspark.sql import functions as F
     # Build the projection list up front, then select it in one shot.
     columns = [
         frame.value, frame.key,
         F.max("key").over(win.rowsBetween(0, 1)),
         F.min("key").over(win.rowsBetween(0, 1)),
         F.count("key").over(win.rowsBetween(float('-inf'), float('inf'))),
         F.rowNumber().over(win),
         F.rank().over(win),
         F.denseRank().over(win),
         F.ntile(2).over(win),
     ]
     actual = sorted(frame.select(*columns).collect())
     expected = [
         ("1", 1, 1, 1, 1, 1, 1, 1, 1), ("2", 1, 1, 1, 3, 1, 1, 1, 1),
         ("2", 1, 2, 1, 3, 2, 1, 1, 1), ("2", 2, 2, 2, 3, 3, 3, 2, 2),
     ]
     # Compare each collected row against the expected tuple, truncated
     # to the row's own width.
     for got, want in zip(actual, expected):
         self.assertEqual(tuple(got), want[:len(got)])