def test_missing_column_names_filter():
    if is_spark_300():
        pytest.skip("Apache Spark 3.0.0 does not handle ORC files without column names")
    with_cpu_session(setup_orc_file_no_column_names)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT _col3,_col2 FROM test_orc_data WHERE _col2 = '155'"))
def test_cache_shuffled_hash_join(data_gen, join_type):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    def do_join(spark):
        left, right = create_df(spark, data_gen, 50, 500)
        cached = left.join(right, left.a == right.r_a, join_type).cache()
        cached.count()
        return cached
    assert_gpu_and_cpu_are_equal_collect(do_join)
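# The join tests above and below rely on a 'create_df' helper defined elsewhere
# in this suite. A minimal sketch of what such a helper might look like,
# assuming the shared 'binary_op_df' generator utility; the exact column names
# and renames are assumptions for illustration, not the suite's real definition:
def create_df(spark, data_gen, left_length, right_length):
    # Build a two-column left table, and a right table whose columns are
    # renamed so the join keys ('a' vs 'r_a') do not collide after the join.
    left = binary_op_df(spark, data_gen, length=left_length)
    right = binary_op_df(spark, data_gen, length=right_length) \
        .withColumnRenamed('a', 'r_a') \
        .withColumnRenamed('b', 'r_b')
    return left, right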
def test_cache_broadcast_hash_join(data_gen, join_type, enableVectorizedConf):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    def do_join(spark):
        left, right = create_df(spark, data_gen, 500, 500)
        cached = left.join(right.hint("broadcast"), left.a == right.r_a, join_type).cache()
        cached.count()
        return cached
    assert_gpu_and_cpu_are_equal_collect(do_join, conf=enableVectorizedConf)
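# 'enableVectorizedConf' arrives via pytest parametrization. A sketch of the
# kind of values it might take, assuming the fixture toggles Spark's vectorized
# in-memory columnar reader so the cache is exercised on both code paths
# (spark.sql.inMemoryColumnarStorage.enableVectorizedReader is a real Spark
# conf key; wiring it up this way is an assumption):
enable_vectorized_confs = [
    {'spark.sql.inMemoryColumnarStorage.enableVectorizedReader': 'true'},
    {'spark.sql.inMemoryColumnarStorage.enableVectorizedReader': 'false'}]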
def test_missing_column_names_filter(spark_tmp_table_factory):
    if is_spark_300():
        pytest.skip("Apache Spark 3.0.0 does not handle ORC files without column names")
    table_name = spark_tmp_table_factory.get()
    with_cpu_session(
        lambda spark: setup_orc_file_no_column_names(spark, table_name))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.sql("SELECT _col3,_col2 FROM {} WHERE _col2 = '155'".format(table_name)))
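# Both versions of this test depend on 'setup_orc_file_no_column_names', which
# is defined elsewhere in the suite. A minimal sketch of one way to produce ORC
# data whose columns carry only the default positional '_colN' names; the exact
# schema and inserted values here are assumptions for illustration:
def setup_orc_file_no_column_names(spark, table_name):
    spark.sql("DROP TABLE IF EXISTS {}".format(table_name))
    # Naming the columns '_colN' mimics an ORC file written without real column
    # names, so readers must fall back to positional names.
    spark.sql("CREATE TABLE {} (_col1 INT, _col2 STRING, _col3 INT) USING orc".format(table_name))
    spark.sql("INSERT INTO {} VALUES (13, '155', 2020)".format(table_name))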
def test_cached_join_filter(data_gen, join_type):
    data, filter = data_gen
    if is_spark_300() and data.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    def do_join(spark):
        left, right = create_df(spark, data, 500, 500)
        cached = left.join(right, left.a == right.r_a, join_type).cache()
        cached.count() # populates the cache
        return cached.filter(filter)
    assert_gpu_and_cpu_are_equal_collect(do_join)
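# 'data_gen' here is a (generator, filter) pair supplied by parametrization.
# A sketch of the kind of pairs that would exercise filtering a cached join
# result; the specific generators and predicates are assumptions:
_cached_join_filter_pairs = [
    (IntegerGen(), 'r_a > 0'),
    (StringGen(), 'r_a is not null')]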
def test_cache_posexplode_makearray(spark_tmp_path, data_gen, ts_rebase, ts_write):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'
    def write_posExplode(data_path):
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr(
                'posexplode(array(b, c, d))', 'a').cache()
            cached.count()
            cached.write.parquet(data_path)
            # Return collected rows so the CPU and GPU results are actually
            # compared; without the return, both sessions yield None and the
            # final assert passes vacuously.
            return spark.read.parquet(data_path).collect()
        return posExplode
    from_cpu = with_cpu_session(write_posExplode(data_path_cpu),
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
              'spark.sql.parquet.outputTimestampType': ts_write})
    from_gpu = with_gpu_session(write_posExplode(data_path_gpu),
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
              'spark.sql.parquet.outputTimestampType': ts_write})
    assert_equal(from_cpu, from_gpu)
def test_cache_posexplode_makearray(spark_tmp_path, data_gen):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'
    def write_posExplode(data_path):
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr(
                'posexplode(array(b, c, d))', 'a').cache()
            cached.count()
            cached.write.parquet(data_path)
            # As above: return collected rows so assert_equal compares real
            # data instead of two None values.
            return spark.read.parquet(data_path).collect()
        return posExplode
    from_cpu = with_cpu_session(write_posExplode(data_path_cpu))
    from_gpu = with_gpu_session(write_posExplode(data_path_gpu))
    assert_equal(from_cpu, from_gpu)
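# 'four_op_df' comes from the shared data-generation utilities. A sketch of
# what such a helper might do, assuming a 'gen_df'/'StructGen' style API like
# the rest of the suite; the default length and seed are assumptions:
def four_op_df(spark, gen, length=2048, seed=0):
    # Four columns of the same generator, so posexplode(array(b, c, d)) has
    # something to explode while 'a' rides along as a plain column.
    return gen_df(spark, StructGen(
        [('a', gen), ('b', gen), ('c', gen), ('d', gen)],
        nullable=False), length=length, seed=seed)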
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark : gen_df(spark, data_gen, length=1024),
        "hash_agg_table",
        'select a, '
        'count(*) as count_stars, '
        'count(b) as count_bees, '
        'sum(b) as sum_of_bees, '
        'max(c) as max_seas, '
        'min(c) as min_seas, '
        'count(distinct c) as count_distinct_cees, '
        'avg(c) as average_seas '
        'from hash_agg_table group by a',
        _no_nans_float_conf)

@pytest.mark.xfail(
    condition=with_spark_session(lambda spark : is_spark_300()),
    reason="[SPARK-32038][SQL] NormalizeFloatingNumbers should also work on distinct aggregate "
           "(https://github.com/apache/spark/pull/28876) "
           "Fixed in later Apache Spark releases.")
@approximate_float
@ignore_order
@pytest.mark.parametrize('data_gen', [_grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
def test_count_distinct_with_nan_floats(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark : gen_df(spark, data_gen, length=1024),
        "hash_agg_table",
        'select a, count(distinct b) as count_distinct_bees from hash_agg_table group by a',
        _no_nans_float_conf)

# TODO: Literal tests
# TODO: First and Last tests
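# '_no_nans_float_conf' is defined elsewhere in this suite. A sketch of the
# kind of settings it plausibly carries, assuming the plugin's float
# aggregation knobs; the exact keys and values are assumptions, not the
# suite's real definition:
_no_nans_float_conf = {
    'spark.rapids.sql.variableFloatAgg.enabled': 'true',
    'spark.rapids.sql.hasNans': 'false'}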