Example #1
    def test_gapply_all_cols(self):
        schema = StructType().add("val2", LongType())
        pandasDF = pd.DataFrame.from_dict({
            "key": [random.randrange(GapplyTests.NKEYS) for _ in range(GapplyTests.NROWS)],
            "val1": [random.randrange(GapplyTests.NVALS) for _ in range(GapplyTests.NROWS)],
            "val2": [random.randrange(GapplyTests.NVALS) for _ in range(GapplyTests.NROWS)]})
        df = self.spark.createDataFrame(pandasDF)
        gd = df.groupBy("key")

        def func(_, vals):
            assert vals.columns.tolist() == ["val1", "val2"], vals.columns
            return pd.DataFrame.from_records([(vals["val2"].sum(),)])

        expected = pandasDF.groupby("key", as_index=False).agg({"val2": "sum"})
        actual = gapply(gd, func, schema).toPandas()
        _assertPandasAlmostEqual(actual, expected)

        def func(_, vals):
            assert vals.columns.tolist() == ["val2", "val1"], vals.columns
            return pd.DataFrame.from_records([(vals["val2"].sum(),)])

        gd = df.select("val2", "key", "val1").groupBy("key")

        actual = gapply(gd, func, schema).toPandas()
        _assertPandasAlmostEqual(actual, expected)
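Worth noting about this example: because no value columns are passed to gapply, the UDF receives every non-grouping column, and the assertions inside func show that those columns arrive in the order they appear in the underlying DataFrame ("val1", "val2" at first, and "val2", "val1" once the select reorders them).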
Example #2
    def test_gapply_empty_schema(self):
        longLongSchema = StructType().add("a", LongType()).add("b", LongType())
        emptyLongLongDF = self.spark.createDataFrame([(1, 2)], schema=longLongSchema)
        gd = emptyLongLongDF.groupBy("a")
        self.assertEqual(gapply(gd, _emptyFunc, StructType(), "b").collect(), [])
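Here the output schema handed to gapply is an empty StructType(), and the test expects collect() to return an empty list even though the grouped DataFrame has a row; _emptyFunc is a helper defined elsewhere in the test module.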
Example #3
    def test_gapply_double_key(self):
        schema = StructType().add("val", LongType())
        randKeys = [
            random.randrange(GapplyTests.NKEYS)
            for _ in range(GapplyTests.NROWS)
        ]
        pandasDF = pd.DataFrame.from_dict({
            "key1":
            randKeys,
            "key2": [GapplyTests.NKEYS + x for x in randKeys],
            "val": [
                random.randrange(GapplyTests.NVALS)
                for _ in range(GapplyTests.NROWS)
            ]
        })
        gd = self.spark.createDataFrame(pandasDF).groupBy("key2", "key1")

        def func(keys, vals):
            assert keys[0] == keys[1] + GapplyTests.NKEYS
            return pd.DataFrame.from_records([(vals["val"].sum(), )])

        expected = pandasDF.groupby(["key2", "key1"],
                                    as_index=False).agg({"val": "sum"})
        actual = gapply(gd, func, schema, "val").toPandas()
        _assertPandasAlmostEqual(actual, expected)
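With two grouping columns, the UDF's first argument is a tuple of key values ordered the same way as the columns given to groupBy ("key2" before "key1"), which is exactly what the assertion keys[0] == keys[1] + GapplyTests.NKEYS checks, since key2 was generated as key1 plus NKEYS.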
Example #4
    def test_gapply_empty(self):
        # Implicitly checks that the installed pandas version is recent enough
        # (unit tests for the version check itself would require some serious mocking).
        longLongSchema = StructType().add("a", LongType()).add("b", LongType())
        emptyLongLongDF = self.spark.createDataFrame([], schema=longLongSchema)
        gd = emptyLongLongDF.groupBy("a")
        self.assertEqual(gapply(gd, _emptyFunc, longLongSchema, "b").collect(), [])
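This test runs gapply over a DataFrame with no rows at all and expects an empty result; as the comment says, it also acts as an indirect check that a sufficiently recent pandas is installed.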
Example #5
    def checkGapplyEquivalentToPandas(self, pandasAggFunction, dataType, dataGen):
        schema = StructType().add("val", dataType)
        pandasDF = pd.DataFrame.from_dict({
            "key": [random.randrange(GapplyTests.NKEYS) for _ in range(GapplyTests.NROWS)],
            "val": [dataGen() for _ in range(GapplyTests.NROWS)]})
        gd = self.spark.createDataFrame(pandasDF).groupBy("key")

        def func(key, vals):
            return pd.DataFrame.from_records([(pandasAggFunction(vals["val"]),)])

        expected = pandasDF.groupby("key", as_index=False).agg({"val": pandasAggFunction})
        actual = gapply(gd, func, schema, "val").toPandas()
        _assertPandasAlmostEqual(actual, expected)
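This helper underlies the per-type aggregation tests: given a pandas aggregation function, a Spark data type, and a value generator, it verifies that aggregating each group through gapply produces the same result as running the same aggregation through pandas' own groupby/agg.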
Example #6
    def test_gapply_no_keys(self):
        schema = StructType().add("val", LongType())
        pandasDF = pd.DataFrame.from_dict({
            "key": [random.randrange(GapplyTests.NKEYS) for _ in range(GapplyTests.NROWS)],
            "val": [random.randrange(GapplyTests.NVALS) for _ in range(GapplyTests.NROWS)]})
        gd = self.spark.createDataFrame(pandasDF).groupBy("key")

        def func(_, vals):
            return pd.DataFrame.from_records([(vals["val"].sum(),)])

        expected = pandasDF.groupby("key", as_index=False).agg({"val": "sum"})[["val"]]
        actual = gapply(gd, func, schema, "val").toPandas()
        _assertPandasAlmostEqual(actual, expected)
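Unlike the other tests, the expected frame here is restricted to the single "val" column, so the comparison covers only the aggregated values and the grouping key is not part of the expected output; the mechanism that drops the key columns is not visible in this excerpt.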
Example #7
    def test_gapply_name_change(self):
        schema = StructType().add("VAL", LongType())
        pandasDF = pd.DataFrame.from_dict({
            "key": [
                random.randrange(GapplyTests.NKEYS)
                for _ in range(GapplyTests.NROWS)
            ],
            "val": [
                random.randrange(GapplyTests.NVALS)
                for _ in range(GapplyTests.NROWS)
            ]
        })
        gd = self.spark.createDataFrame(pandasDF).groupBy("key")

        def func(key, vals):
            return pd.DataFrame.from_records([(vals["val"].sum(), )])

        expected = pandasDF.groupby("key", as_index=False).agg({"val": "sum"})
        expected = expected.rename(columns={"val": "VAL"})
        actual = gapply(gd, func, schema, "val").toPandas()
        _assertPandasAlmostEqual(actual, expected)
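The schema names its column "VAL" while the input column is "val"; by renaming the expected pandas frame, the test confirms that the result's column name follows the schema passed to gapply rather than the input column name.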
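All of the examples above follow one calling pattern: group a Spark DataFrame, pass gapply a plain Python function that takes the group key(s) and a pandas DataFrame holding that group's rows, and get back a Spark DataFrame whose value columns follow the supplied schema. The standalone sketch below condenses that pattern; the SparkSession setup and the import path for gapply (spark_sklearn.group_apply here) are assumptions rather than something these excerpts show, so adjust them to your environment.

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, LongType

# Assumed import path for gapply; adjust to wherever it lives in your installation.
from spark_sklearn.group_apply import gapply

spark = SparkSession.builder.master("local[2]").appName("gapply-sketch").getOrCreate()

df = spark.createDataFrame(pd.DataFrame({"key": [1, 1, 2], "val": [10, 20, 30]}))
gd = df.groupBy("key")

# The UDF receives the group key(s) and the group's rows as a pandas DataFrame,
# and must return a pandas DataFrame whose columns match the schema below.
def group_total(key, vals):
    return pd.DataFrame.from_records([(int(vals["val"].sum()),)])

schema = StructType().add("total", LongType())
result = gapply(gd, group_total, schema, "val")
result.show()  # one row per key: the grouping column plus "total"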