def test_window_functions(self): df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) w = Window.partitionBy("value").orderBy("key") from pyspark.sql import functions as F sel = df.select( df.value, df.key, F.max("key").over(w.rowsBetween(0, 1)), F.min("key").over(w.rowsBetween(0, 1)), F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))), F.rowNumber().over(w), F.rank().over(w), F.denseRank().over(w), F.ntile(2).over(w), ) rs = sorted(sel.collect()) expected = [ ("1", 1, 1, 1, 1, 1, 1, 1, 1), ("2", 1, 1, 1, 3, 1, 1, 1, 1), ("2", 1, 2, 1, 3, 2, 1, 1, 1), ("2", 2, 2, 2, 3, 3, 3, 2, 2), ] for r, ex in zip(rs, expected): self.assertEqual(tuple(r), ex[: len(r)])
def test_window_functions(self): df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) w = Window.partitionBy("value").orderBy("key") from pyspark.sql import functions as F sel = df.select( df.value, df.key, F.max("key").over(w.rowsBetween(0, 1)), F.min("key").over(w.rowsBetween(0, 1)), F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))), F.rowNumber().over(w), F.rank().over(w), F.denseRank().over(w), F.ntile(2).over(w)) rs = sorted(sel.collect()) expected = [ ("1", 1, 1, 1, 1, 1, 1, 1, 1), ("2", 1, 1, 1, 3, 1, 1, 1, 1), ("2", 1, 2, 1, 3, 2, 1, 1, 1), ("2", 2, 2, 2, 3, 3, 3, 2, 2) ] for r, ex in zip(rs, expected): self.assertEqual(tuple(r), ex[:len(r)])