def test_window_aggs_for_ranges(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048),
        "window_agg_table",
        'select '
        ' sum(c) over '
        '   (partition by a order by cast(b as timestamp) asc '
        '       range between interval 1 day preceding and interval 1 day following) as sum_c_asc, '
        ' avg(c) over '
        '   (partition by a order by cast(b as timestamp) asc '
        '       range between interval 1 day preceding and interval 1 day following) as avg_c_asc, '
        ' max(c) over '
        '   (partition by a order by cast(b as timestamp) desc '
        '       range between interval 2 days preceding and interval 1 days following) as max_c_desc, '
        ' min(c) over '
        '   (partition by a order by cast(b as timestamp) asc '
        '       range between interval 2 days preceding and current row) as min_c_asc, '
        ' count(1) over '
        '   (partition by a order by cast(b as timestamp) asc '
        '       range between CURRENT ROW and UNBOUNDED following) as count_1_asc, '
        ' count(c) over '
        '   (partition by a order by cast(b as timestamp) asc '
        '       range between CURRENT ROW and UNBOUNDED following) as count_c_asc, '
        ' avg(c) over '
        '   (partition by a order by cast(b as timestamp) asc '
        '       range between UNBOUNDED preceding and CURRENT ROW) as avg_c_unbounded, '
        ' sum(c) over '
        '   (partition by a order by cast(b as timestamp) asc '
        '       range between UNBOUNDED preceding and CURRENT ROW) as sum_c_unbounded, '
        ' max(c) over '
        '   (partition by a order by cast(b as timestamp) asc '
        '       range between UNBOUNDED preceding and UNBOUNDED following) as max_c_unbounded '
        'from window_agg_table',
        conf={'spark.rapids.sql.castFloatToDecimal.enabled': True})

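# For comparison, the same day-sized range frame can be written with the
# DataFrame API. A minimal sketch (illustrative only, not part of the suite):
# rangeBetween takes numeric offsets, so the ordering timestamp is cast to
# epoch seconds and the bounds are given in seconds.
def _sketch_range_frame_one_day(df):
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window
    one_day = 86400  # seconds per day
    w = (Window.partitionBy('a')
         .orderBy(F.col('b').cast('timestamp').cast('long'))
         .rangeBetween(-one_day, one_day))
    return df.withColumn('sum_c_asc', F.sum('c').over(w))
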
def test_window_aggs_for_rows_collect_list():
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, _gen_data_for_collect_list),
        "window_collect_table",
        '''
        select
          collect_list(c_bool) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_bool,
          collect_list(c_short) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_short,
          collect_list(c_int) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_int,
          collect_list(c_long) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_long,
          collect_list(c_date) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_date,
          collect_list(c_ts) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_ts,
          collect_list(c_byte) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_byte,
          collect_list(c_string) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_string,
          collect_list(c_float) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_float,
          collect_list(c_double) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_double,
          collect_list(c_decimal) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_decimal,
          collect_list(c_struct) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_struct,
          collect_list(c_array) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_array,
          collect_list(c_map) over
            (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_map
        from window_collect_table
        ''')

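# A self-contained illustration (for exposition; assumes only a SparkSession)
# of the CURRENT ROW .. UNBOUNDED FOLLOWING frame used above: each row collects
# the values from itself through the end of its partition.
def _sketch_collect_list_suffix(spark):
    # expected 'suffix' values in order: ['a','b','c'], ['b','c'], ['c']
    return spark.sql('''
        select x, collect_list(y) over
            (partition by p order by x rows between CURRENT ROW and UNBOUNDED FOLLOWING) as suffix
        from values (1, 1, 'a'), (1, 2, 'b'), (1, 3, 'c') as t(p, x, y)
    ''')
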
def test_running_window_function_exec_for_all_aggs():
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, _gen_data_for_collect_list),
        "window_collect_table",
        '''
        select
          sum(c_int) over
            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as sum_int,
          min(c_long) over
            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as min_long,
          max(c_date) over
            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as max_date,
          count(1) over
            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as count_1,
          count(*) over
            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as count_star,
          row_number() over
            (partition by a order by b,c_int) as row_num,
          rank() over
            (partition by a order by b,c_int) as rank_val,
          dense_rank() over
            (partition by a order by b,c_int) as dense_rank_val,
          collect_list(c_float) over
            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as collect_float,
          collect_list(c_decimal) over
            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as collect_decimal,
          collect_list(c_struct) over
            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as collect_struct
        from window_collect_table
        ''')

def test_window_running(b_gen, c_gen, batch_size):
    conf = {'spark.rapids.sql.batchSizeBytes': batch_size,
            'spark.rapids.sql.hasNans': False,
            'spark.rapids.sql.variableFloatAgg.enabled': True,
            'spark.rapids.sql.castFloatToDecimal.enabled': True}
    query_parts = ['b', 'a',
            'row_number() over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as row_num',
            'rank() over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as rank_val',
            'dense_rank() over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as dense_rank_val',
            'count(c) over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as count_col',
            'min(c) over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as min_col',
            'max(c) over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as max_col']

    # Decimal precision can grow too large. Float and Double can get odd results
    # for Inf/-Inf because of ordering.
    if isinstance(c_gen.data_type, NumericType) and \
            not isinstance(c_gen, (FloatGen, DoubleGen, DecimalGen)):
        query_parts.append('sum(c) over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as sum_col')

    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: three_col_df(spark, LongRangeGen(),
                                   RepeatSeqGen(b_gen, length=100), c_gen,
                                   length=1024 * 14),
        "window_agg_table",
        'select ' + ', '.join(query_parts) + ' from window_agg_table ',
        validate_execs_in_gpu_plan=['GpuRunningWindowExec'],
        conf=conf)

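# The running frame above (UNBOUNDED PRECEDING .. CURRENT ROW) is what the
# GpuRunningWindowExec node handles: every row sees a prefix of its partition,
# so aggregations become cumulative. A tiny standalone example (illustrative;
# assumes only a SparkSession):
def _sketch_running_sum(spark):
    # expected running_sum values in order: 1, 3, 6
    return spark.sql('''
        select a, sum(c) over
            (order by a rows between UNBOUNDED PRECEDING and CURRENT ROW) as running_sum
        from values (1, 1), (2, 2), (3, 3) as t(a, c)
    ''')
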
def test_window_running_float_decimal_sum(batch_size):
    conf = {'spark.rapids.sql.batchSizeBytes': batch_size,
            'spark.rapids.sql.variableFloatAgg.enabled': True,
            'spark.rapids.sql.castFloatToDecimal.enabled': True}
    query_parts = ['b', 'a',
            'sum(cast(c as double)) over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as dbl_sum',
            'sum(abs(dbl)) over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as dbl_abs_sum',
            'sum(cast(c as float)) over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as flt_sum',
            'sum(abs(flt)) over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as flt_abs_sum',
            'sum(cast(c as Decimal(6,1))) over (partition by b order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as dec_sum']

    gen = StructGen([('a', LongRangeGen()),
                     ('b', RepeatSeqGen(int_gen, length=1000)),
                     ('c', short_gen),
                     ('flt', float_gen),
                     ('dbl', double_gen)], nullable=False)
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, gen, length=1024 * 14),
        "window_agg_table",
        'select ' + ', '.join(query_parts) + ' from window_agg_table ',
        validate_execs_in_gpu_plan=['GpuRunningWindowExec'],
        conf=conf)

def test_window_running_no_part(b_gen, batch_size):
    conf = {'spark.rapids.sql.batchSizeBytes': batch_size,
            'spark.rapids.sql.hasNans': False,
            'spark.rapids.sql.castFloatToDecimal.enabled': True}
    query_parts = [
        'row_number() over (order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as row_num',
        'rank() over (order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as rank_val',
        'dense_rank() over (order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as dense_rank_val',
        'count(b) over (order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as count_col',
        'min(b) over (order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as min_col',
        'max(b) over (order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as max_col']

    # Float and Double can get odd results for Inf/-Inf because of ordering,
    # so only sum the other numeric types.
    if isinstance(b_gen.data_type, NumericType) and \
            not isinstance(b_gen, (FloatGen, DoubleGen)):
        query_parts.append('sum(b) over (order by a rows between UNBOUNDED PRECEDING AND CURRENT ROW) as sum_col')

    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: two_col_df(spark, LongRangeGen(), b_gen, length=1024 * 14),
        "window_agg_table",
        'select ' + ', '.join(query_parts) + ' from window_agg_table ',
        validate_execs_in_gpu_plan=['GpuRunningWindowExec'],
        conf=conf)

def test_window_aggs_for_rows(data_gen, batch_size):
    conf = {'spark.rapids.sql.batchSizeBytes': batch_size,
            'spark.rapids.sql.castFloatToDecimal.enabled': True}
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048),
        "window_agg_table",
        'select '
        ' sum(c) over '
        '   (partition by a order by b,c asc rows between 1 preceding and 1 following) as sum_c_asc, '
        ' max(c) over '
        '   (partition by a order by b desc, c desc rows between 2 preceding and 1 following) as max_c_desc, '
        ' min(c) over '
        '   (partition by a order by b,c rows between 2 preceding and current row) as min_c_asc, '
        ' count(1) over '
        '   (partition by a order by b,c rows between UNBOUNDED preceding and UNBOUNDED following) as count_1, '
        ' count(c) over '
        '   (partition by a order by b,c rows between UNBOUNDED preceding and UNBOUNDED following) as count_c, '
        ' avg(c) over '
        '   (partition by a order by b,c rows between UNBOUNDED preceding and UNBOUNDED following) as avg_c, '
        ' rank() over '
        '   (partition by a order by b,c rows between UNBOUNDED preceding and CURRENT ROW) as rank_val, '
        ' dense_rank() over '
        '   (partition by a order by b,c rows between UNBOUNDED preceding and CURRENT ROW) as dense_rank_val, '
        ' row_number() over '
        '   (partition by a order by b,c rows between UNBOUNDED preceding and CURRENT ROW) as row_num '
        'from window_agg_table ',
        conf=conf)

def test_hash_multiple_filters(data_gen, conf):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=100),
        "hash_agg_table",
        'select count(a) filter (where c > 50),' +
        'count(b) filter (where c > 100),' +
        'avg(b) filter (where b > 20),' +
        'min(a), max(b) filter (where c > 250) from hash_agg_table group by a',
        conf)

def test_orderby_array_unique(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: append_unique_int_col_to_df(spark, unary_op_df(spark, data_gen)),
        'array_table',
        'select array_table.a, array_table.uniq_int from array_table order by uniq_int',
        conf=allow_negative_scale_of_decimal_conf)

def test_window_ride_along(ride_along):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, [('a', LongRangeGen()), ('b', ride_along)]),
        "window_agg_table",
        'select *,'
        ' row_number() over (order by a) as row_num '
        'from window_agg_table ',
        conf=allow_negative_scale_of_decimal_conf)

def test_window_aggs_for_ranges_of_dates(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048),
        "window_agg_table",
        'select '
        ' sum(c) over '
        '   (partition by a order by b asc '
        '       range between 1 preceding and 1 following) as sum_c_asc '
        'from window_agg_table')

def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf, parameterless):
    conf.update({'spark.sql.legacy.allowParameterlessCount': parameterless})
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=100),
        "hash_agg_table",
        'select avg(a),' +
        'avg(distinct b),' +
        'avg(distinct c),' +
        'sum(distinct a),' +
        'count(distinct b),' +
        'count(a),' +
        'count(),' +
        'sum(a),' +
        'min(a),' +
        'max(a) from hash_agg_table group by a',
        conf)

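# Note on the bare count() above: Spark only accepts a parameterless count when
# spark.sql.legacy.allowParameterlessCount is true (it then behaves like
# count(*)), which is why the test toggles that config explicitly.
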
def test_scalar_subquery_basics(data_gen):
    # Fix num_slices at 1 to make sure that first/last return the same results under CPU and GPU.
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, [('a', data_gen)], num_slices=1),
        'table',
        '''
        select a, (select last(a) from table)
        from table
        where a > (select first(a) from table)
        ''')

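# Why num_slices=1 matters above: first() and last() depend on the order in
# which rows arrive from the input partitions, so they are only deterministic
# for single-partition input. A standalone sketch (illustrative only; assumes a
# SparkSession):
def _sketch_first_is_order_sensitive(spark):
    df = spark.range(100)
    # with several partitions, which row is "first" depends on partitioning
    unstable = df.repartition(8).selectExpr('first(id)')
    # with a single partition the answer is stable
    stable = df.coalesce(1).selectExpr('first(id)')
    return unstable, stable
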
def test_parquet_push_down_on_interval_type(spark_tmp_path):
    gen_list = [('_c1', DayTimeIntervalGen())]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list).coalesce(1).write.parquet(data_path))
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: spark.read.parquet(data_path),
        "testData",
        "select * from testData where _c1 > interval '10 0:0:0' day to second")

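# The literal interval '10 0:0:0' day to second is a DayTimeInterval of exactly
# ten days. To eyeball whether the predicate actually reaches the Parquet scan,
# one can inspect the plan (a sketch; assumes a SparkSession and the data_path
# written above):
def _sketch_show_interval_pushdown(spark, data_path):
    spark.read.parquet(data_path) \
        .where("_c1 > interval '10 0:0:0' day to second") \
        .explain()  # look for the predicate under PushedFilters in the scan node
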
def test_window_aggs_for_rows_count_non_null(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048),
        "window_agg_table",
        'select '
        ' count(c) over '
        '   (partition by a order by b,c '
        '       rows between UNBOUNDED preceding and UNBOUNDED following) as count_non_null '
        'from window_agg_table ')

def test_hash_query_max_with_multiple_distincts(data_gen, conf):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=100),
        "hash_agg_table",
        'select max(c),' +
        'sum(distinct a),' +
        'count(distinct b) from hash_agg_table group by a',
        conf)

def test_window_aggs_for_ranges_numeric_long_overflow(data_gen):
    # 9223372036854775807 is Long.MaxValue, so computing the frame bounds can overflow
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048),
        "window_agg_table",
        'select '
        ' sum(b) over '
        '   (partition by a order by b asc '
        '       range between 9223372036854775807 preceding and 9223372036854775807 following) as sum_b_asc '
        'from window_agg_table')

def test_scalar_subquery(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048),
        'table',
        '''
        select l, i, f, (select count(s) from table) as c
        from table
        where l > (select max(i) from table)
           or f < (select min(i) from table)
        ''')

def test_window_aggs_for_ranges_numeric_short_overflow(data_gen):
    # 32767 is Short.MaxValue, so computing the frame bounds can overflow
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048),
        "window_agg_table",
        'select '
        ' sum(b) over '
        '   (partition by a order by b asc '
        '       range between 32767 preceding and 32767 following) as sum_b_asc '
        'from window_agg_table',
        conf={'spark.rapids.sql.window.range.short.enabled': True})

def test_hive_simple_udf_native(enable_rapids_udf_example_native):
    with_spark_session(skip_if_no_hive)
    # input for the word-count UDF; any short strings will do (this definition
    # was missing here and mirrors the one used by the other Hive UDF tests)
    data_gens = [["s", StringGen('.{0,30}')]]
    def evalfn(spark):
        load_hive_udf_or_skip_test(spark, "wordcount",
                                   "com.nvidia.spark.rapids.udf.hive.StringWordCount")
        return gen_df(spark, data_gens)
    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_native_udf_test_table",
        "SELECT wordcount(s) FROM hive_native_udf_test_table")

def test_concat_ws_sql_arrays_all_null_col_sep():
    gen = ArrayGen(StringGen(nullable=True), nullable=True)
    sep = NullGen()
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: three_col_df(spark, gen, StringGen(nullable=True), sep),
        'concat_ws_table',
        'select ' +
        'concat_ws(c, array(null, null)), ' +
        'concat_ws(c, a, array(null), b, array()), ' +
        'concat_ws(c, b, b, array(b)) from concat_ws_table')

def test_hive_generic_udf():
    with_spark_session(skip_if_no_hive)
    data_gens = [["s", StringGen('.{0,30}')]]
    def evalfn(spark):
        load_hive_udf_or_skip_test(spark, "urlencode",
                                   "com.nvidia.spark.rapids.udf.hive.URLEncode")
        return gen_df(spark, data_gens)
    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_generic_udf_test_table",
        "SELECT urlencode(s) FROM hive_generic_udf_test_table")

def test_hive_simple_udf():
    with_spark_session(skip_if_no_hive)
    data_gens = [["i", int_gen], ["s", encoded_url_gen]]
    def evalfn(spark):
        load_hive_udf_or_skip_test(spark, "urldecode",
                                   "com.nvidia.spark.rapids.udf.hive.URLDecode")
        return gen_df(spark, data_gens)
    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_simple_udf_test_table",
        "SELECT i, urldecode(s) FROM hive_simple_udf_test_table")

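# Hive UDFs are registered in Spark with plain SQL; a minimal sketch of what
# helpers like load_hive_udf_or_skip_test presumably boil down to (assumes the
# UDF jar is already on the session classpath):
def _sketch_register_hive_udf(spark, name, udf_class):
    spark.sql("CREATE TEMPORARY FUNCTION {} AS '{}'".format(name, udf_class))
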
def test_hash_agg_with_nan_keys(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=1024),
        "hash_agg_table",
        'select a, '
        'count(*) as count_stars, '
        'count(b) as count_bees, '
        'sum(b) as sum_of_bees, '
        'max(c) as max_seas, '
        'min(c) as min_seas, '
        'count(distinct c) as count_distinct_cees, '
        'avg(c) as average_seas '
        'from hash_agg_table group by a',
        _no_nans_float_conf)

def test_concat_ws_sql_arrays_col_sep():
    gen = ArrayGen(StringGen(nullable=True), nullable=True)
    sep = StringGen('[-,*,+,!]', nullable=True)
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: three_col_df(spark, gen, StringGen(nullable=True), sep),
        'concat_ws_table',
        'select ' +
        'concat_ws(c, array()) as emptyCon, ' +
        'concat_ws(c, b, b, array(b)), ' +
        'concat_ws(c, a, array(null, c), b, array()), ' +
        'concat_ws(c, array(null, null)), ' +
        'concat_ws(c, a, array(null), b, array()), ' +
        'concat_ws(c, array(\'2\', \'\', \'3\', \'Z\', b)) from concat_ws_table')

def test_concat_ws_sql_col_sep():
    gen = StringGen(nullable=True)
    sep = StringGen('[-,*,+,!]', nullable=True)
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: three_col_df(spark, gen, gen, sep),
        'concat_ws_table',
        'select ' +
        'concat_ws(c, a), ' +
        'concat_ws(c, a, b), ' +
        'concat_ws(c, null, null), ' +
        'concat_ws(c, \'aaa\', \'bbb\', \'zzz\'), ' +
        'concat_ws(c, b, \'\', \'bbb\', \'zzz\'), ' +
        'concat_ws(c, b, a, cast(null as string)) from concat_ws_table')

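# The null-handling rules the concat_ws tests exercise, in one self-contained
# example (illustrative; assumes only a SparkSession): a NULL separator makes
# the whole result NULL, while NULL arguments are simply skipped.
def _sketch_concat_ws_nulls(spark):
    return spark.sql('''
        select concat_ws('-', 'a', null, 'b') as skips_null_args,    -- 'a-b'
               concat_ws(cast(null as string), 'a', 'b') as null_sep -- NULL
    ''')
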
def test_hive_empty_generic_udf():
    with_spark_session(skip_if_no_hive)
    def evalfn(spark):
        load_hive_udf(spark, "emptygeneric",
                      "com.nvidia.spark.rapids.tests.udf.hive.EmptyHiveGenericUDF")
        return gen_df(spark, [["s", string_gen]])
    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_generic_udf_test_table",
        "SELECT emptygeneric(s) FROM hive_generic_udf_test_table",
        conf={'spark.rapids.sql.rowBasedUDF.enabled': 'true'})

def test_window_aggs_for_rows_lead_lag_on_arrays(a_gen, b_gen, c_gen, d_gen):
    data_gen = [('a', RepeatSeqGen(a_gen, length=20)),
                ('b', b_gen),
                ('c', c_gen),
                ('d', d_gen),
                ('d_default', d_gen)]
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048),
        "window_agg_table",
        '''
        SELECT
            LEAD(d, 5) OVER (PARTITION by a ORDER BY b,c) lead_d_5,
            LEAD(d, 2, d_default) OVER (PARTITION by a ORDER BY b,c) lead_d_2_default,
            LAG(d, 5) OVER (PARTITION by a ORDER BY b,c) lag_d_5,
            LAG(d, 2, d_default) OVER (PARTITION by a ORDER BY b,c) lag_d_2_default
        FROM window_agg_table
        ''')

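# LEAD/LAG semantics in one tiny example (illustrative; assumes only a
# SparkSession): LEAD(x, n) reads n rows ahead in partition order, LAG(x, n)
# reads n rows back, and the optional third argument supplies the default used
# when the offset runs off the edge of the partition.
def _sketch_lead_lag(spark):
    return spark.sql('''
        select x,
               lead(x, 1) over (order by x) as next_x,         -- NULL on the last row
               lag(x, 1, -1) over (order by x) as prev_or_neg1 -- -1 on the first row
        from values (1), (2), (3) as t(x)
    ''')
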
def test_concat_ws_sql_arrays():
    gen = ArrayGen(StringGen(nullable=True), nullable=True)
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: three_col_df(spark, gen, gen, StringGen(nullable=True)),
        'concat_ws_table',
        'select ' +
        'concat_ws("-", array()), ' +
        'concat_ws(null, c, c, array(c)), ' +
        'concat_ws("-", array(), c), ' +
        'concat_ws("-", a, b), ' +
        'concat_ws("-", a, array(null, c), b, array()), ' +
        'concat_ws("-", array(null, null)), ' +
        'concat_ws("-", a, array(null), b, array()), ' +
        'concat_ws("*", array(\'2\', \'\', \'3\', \'Z\', c)) from concat_ws_table')