def test_hive_simple_udf_native(enable_rapids_udf_example_native):
    """Run the native StringWordCount Hive simple UDF on CPU and GPU via SQL
    and assert both produce the same results.

    The ``enable_rapids_udf_example_native`` fixture makes the native UDF
    example library available to the session.
    """
    with_spark_session(skip_if_no_hive)
    # BUG FIX: `data_gens` was referenced in evalfn below but never defined,
    # which raised NameError when the test ran. Define it here with the same
    # short-string generator the sibling Hive UDF tests use (wordcount takes
    # a single string column) — TODO confirm against the intended generator.
    data_gens = [["s", StringGen('.{0,30}')]]

    def evalfn(spark):
        # Register the UDF (or skip the test if it cannot be loaded), then
        # build the input DataFrame.
        load_hive_udf_or_skip_test(
            spark, "wordcount", "com.nvidia.spark.rapids.udf.hive.StringWordCount")
        return gen_df(spark, data_gens)

    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_native_udf_test_table",
        "SELECT wordcount(s) FROM hive_native_udf_test_table")
def test_hive_generic_udf():
    """Exercise the URLEncode Hive generic UDF through SQL and verify that the
    CPU and GPU runs return identical results."""
    with_spark_session(skip_if_no_hive)

    def build_input(spark):
        # Register the UDF (skipping the test if unavailable) and produce a
        # single-column DataFrame of short random strings to encode.
        load_hive_udf_or_skip_test(
            spark, "urlencode", "com.nvidia.spark.rapids.udf.hive.URLEncode")
        return gen_df(spark, [["s", StringGen('.{0,30}')]])

    assert_gpu_and_cpu_are_equal_sql(
        build_input,
        "hive_generic_udf_test_table",
        "SELECT urlencode(s) FROM hive_generic_udf_test_table")
def test_hive_simple_udf():
    """Exercise the URLDecode Hive simple UDF through SQL and verify that the
    CPU and GPU runs return identical results.

    NOTE(review): another ``def test_hive_simple_udf()`` appears later in this
    file; the later definition shadows this one at import time, so pytest only
    collects one of them. Confirm which is intended and rename/remove the other.
    """
    with_spark_session(skip_if_no_hive)
    columns = [["i", int_gen], ["s", encoded_url_gen]]

    def build_input(spark):
        # Register the UDF (skipping the test if unavailable), then generate
        # an id column plus a column of URL-encoded strings to decode.
        load_hive_udf_or_skip_test(
            spark, "urldecode", "com.nvidia.spark.rapids.udf.hive.URLDecode")
        return gen_df(spark, columns)

    assert_gpu_and_cpu_are_equal_sql(
        build_input,
        "hive_simple_udf_test_table",
        "SELECT i, urldecode(s) FROM hive_simple_udf_test_table")
def test_hive_simple_udf():
    """Exercise a URLDecode simple UDF through SQL, comparing CPU and GPU output.

    NOTE(review): this duplicates the name of an earlier
    ``test_hive_simple_udf`` in this file, so this later definition shadows it
    and pytest collects only one of the two. It also diverges from every
    sibling test here: it calls ``load_udf_or_skip_test`` (not
    ``load_hive_udf_or_skip_test``) and references
    ``com.nvidia.spark.rapids.udf.URLDecode`` (not the ``...udf.hive``
    package). Confirm which variant is current and drop the other.
    """
    with_spark_session(skip_if_no_hive)
    # Strings shaped like URL-encoded data: optional literal chars mixed with
    # %XX escape sequences.
    encoded_pattern = '([^%]{0,1}(%[0-9A-F][0-9A-F]){0,1}){0,30}'
    columns = [["i", int_gen], ["s", StringGen(encoded_pattern)]]

    def build_input(spark):
        load_udf_or_skip_test(
            spark, "urldecode", "com.nvidia.spark.rapids.udf.URLDecode")
        return gen_df(spark, columns)

    assert_gpu_and_cpu_are_equal_sql(
        build_input,
        "hive_simple_udf_test_table",
        "SELECT i, urldecode(s) FROM hive_simple_udf_test_table")
def test_hive_empty_generic_udf():
    """Run the no-op EmptyHiveGenericUDF with row-based UDF execution enabled
    and check CPU and GPU produce identical results."""
    with_spark_session(skip_if_no_hive)

    def build_input(spark):
        load_hive_udf(
            spark, "emptygeneric",
            "com.nvidia.spark.rapids.tests.udf.hive.EmptyHiveGenericUDF")
        return gen_df(spark, [["s", string_gen]])

    # Row-based UDF mode must be switched on for this UDF to run on the GPU
    # pipeline.
    assert_gpu_and_cpu_are_equal_sql(
        build_input,
        "hive_generic_udf_test_table",
        "SELECT emptygeneric(s) FROM hive_generic_udf_test_table",
        conf={'spark.rapids.sql.rowBasedUDF.enabled': 'true'})
def test_hive_empty_simple_udf():
    """Run the no-op EmptyHiveSimpleUDF (with a constant string argument) with
    row-based UDF execution enabled and check CPU and GPU agree."""
    with_spark_session(skip_if_no_hive)
    columns = [["i", int_gen], ["s", string_gen]]

    def build_input(spark):
        load_hive_udf(
            spark, "emptysimple",
            "com.nvidia.spark.rapids.tests.udf.hive.EmptyHiveSimpleUDF")
        return gen_df(spark, columns)

    # Row-based UDF mode must be switched on for this UDF to run on the GPU
    # pipeline.
    assert_gpu_and_cpu_are_equal_sql(
        build_input,
        "hive_simple_udf_test_table",
        "SELECT i, emptysimple(s, 'const_string') FROM hive_simple_udf_test_table",
        conf={'spark.rapids.sql.rowBasedUDF.enabled': 'true'})
lambda spark : gen_df(spark, data_gen, length=1024), "hash_agg_table", 'select a, ' 'count(*) as count_stars, ' 'count(b) as count_bees, ' 'sum(b) as sum_of_bees, ' 'max(c) as max_seas, ' 'min(c) as min_seas, ' 'count(distinct c) as count_distinct_cees, ' 'avg(c) as average_seas ' 'from hash_agg_table group by a', _no_nans_float_conf) @pytest.mark.xfail( condition=with_spark_session(lambda spark : is_spark_300()), reason="[SPARK-32038][SQL] NormalizeFloatingNumbers should also work on distinct aggregate " "(https://github.com/apache/spark/pull/28876) " "Fixed in later Apache Spark releases.") @approximate_float @ignore_order @pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn) def test_count_distinct_with_nan_floats(data_gen): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=1024), "hash_agg_table", 'select a, count(distinct b) as count_distinct_bees from hash_agg_table group by a', _no_nans_float_conf) # TODO: Literal tests # TODO: First and Last tests
assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.sql( 'select a, ' 'count(*) as count_stars, ' 'count(b) as count_bees, ' 'sum(b) as sum_of_bees, ' 'max(c) as max_seas, ' 'min(c) as min_seas, ' 'count(distinct c) as count_distinct_cees, ' 'avg(c) as average_seas ' 'from hash_agg_table group by a'), conf=_no_nans_float_conf) @pytest.mark.xfail( condition=with_spark_session(lambda spark : spark.sparkContext.version == "3.0.0"), reason="[SPARK-32038][SQL] NormalizeFloatingNumbers should also work on distinct aggregate " "(https://github.com/apache/spark/pull/28876) " "Fixed in later Apache Spark releases.") @approximate_float @ignore_order @pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn) def test_count_distinct_with_nan_floats(data_gen): df = with_cpu_session( lambda spark : gen_df(spark, data_gen, length=1024)) df.createOrReplaceTempView("hash_agg_table") assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.sql( 'select a, ' 'count(distinct b) as count_distinct_bees ' 'from hash_agg_table group by a'),