ByteGen(), ShortGen(), IntegerGen(), LongGen(), FloatGen(), DoubleGen(), BooleanGen(), DateGen(), TimestampGen() ] #sort locally because of https://github.com/NVIDIA/spark-rapids/issues/84 @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.xfail(condition=not (is_before_spark_310()), reason='https://github.com/NVIDIA/spark-rapids/issues/953') def test_posexplode_makearray(data_gen): assert_gpu_and_cpu_are_equal_collect(lambda spark: four_op_df( spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a')) #sort locally because of https://github.com/NVIDIA/spark-rapids/issues/84 @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) def test_posexplode_litarray(data_gen): array_lit = gen_scalar( ArrayGen(data_gen, min_length=3, max_length=3, nullable=False)) assert_gpu_and_cpu_are_equal_collect( lambda spark: four_op_df(spark, data_gen).select( f.col('a'), f.col('b'), f.col('c'), f.posexplode(array_lit)))
RepeatSeqGen(LongGen(nullable=(True, 20.0)), length=10)), ('b', IntegerGen()), ('c', LongGen())] _mixed_df2_with_nulls = [('a', RepeatSeqGen(LongGen(nullable=(True, 20.0)), length=10)), ('b', StringGen()), ('c', BooleanGen())] @ignore_order @pytest.mark.parametrize('join_type', [ 'Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti', pytest.param( 'FullOuter', marks=pytest.mark.xfail( condition=not (is_before_spark_310()), reason='https://github.com/NVIDIA/spark-rapids/issues/575')), 'Cross' ], ids=idfn) def test_broadcast_join_mixed(join_type): def do_join(spark): left = gen_df(spark, _mixed_df1_with_nulls, length=500) right = gen_df(spark, _mixed_df2_with_nulls, length=500).withColumnRenamed("a", "r_a")\ .withColumnRenamed("b", "r_b").withColumnRenamed("c", "r_c") return left.join(broadcast(right), left.a.eqNullSafe(right.r_a), join_type) assert_gpu_and_cpu_are_equal_collect(do_join)
f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first() ], ids=idfn)
def test_single_sort_in_part(data_gen, order):
    """Apply sortWithinPartitions(order) to unary_op_df(spark, data_gen).

    The query and allow_negative_scale_of_decimal_conf are passed to
    assert_gpu_and_cpu_are_equal_collect. The parametrize decorators for
    this test begin above the visible part of the file.
    """
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).sortWithinPartitions(order),
        conf=allow_negative_scale_of_decimal_conf)


# Generators used by test_multi_orderby. The float and double entries are
# wrapped in pytest.param so they are marked xfail when is_before_spark_310()
# is true; the stated reason is the ordering of -0.0 and 0.0.
orderable_gens_sort = [
    byte_gen, short_gen, int_gen, long_gen,
    pytest.param(float_gen, marks=pytest.mark.xfail(
        condition=is_before_spark_310(),
        reason='Spark has -0.0 < 0.0 before Spark 3.1')),
    pytest.param(double_gen, marks=pytest.mark.xfail(
        condition=is_before_spark_310(),
        reason='Spark has -0.0 < 0.0 before Spark 3.1')),
    boolean_gen, timestamp_gen, date_gen, string_gen, null_gen
] + decimal_gens


@pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn)
def test_multi_orderby(data_gen):
    # NOTE(review): the call below is cut off at the end of the visible
    # source; any remaining arguments are not shown here.
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, data_gen).orderBy(
            f.col('a'), f.col('b').desc()),
.coalesce(1).selectExpr( 'min(a)', 'max(a)', 'first(a)', 'last(a)', 'count(a)', 'count(1)'), conf = _no_nans_float_conf) @pytest.mark.parametrize('data_gen', non_nan_all_basic_gens, ids=idfn) def test_distinct_count_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).selectExpr( 'count(DISTINCT a)')) @pytest.mark.xfail(condition=is_before_spark_310(), reason='Spark fixed distinct count of NaNs in 3.1') @pytest.mark.parametrize('data_gen', [float_gen, double_gen], ids=idfn) def test_distinct_float_count_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).selectExpr( 'count(DISTINCT a)')) @approximate_float @pytest.mark.parametrize('data_gen', numeric_gens, ids=idfn) def test_arithmetic_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( 'sum(a)', 'avg(a)'), conf = _no_nans_float_conf)
LongGen(nullable=False), FloatGen(nullable=False), DoubleGen(nullable=False), BooleanGen(nullable=False), TimestampGen(nullable=False), DateGen(nullable=False), StringGen(nullable=False), DecimalGen(nullable=False), DecimalGen(precision=7, scale=-3, nullable=False), DecimalGen(precision=7, scale=3, nullable=False), DecimalGen(precision=7, scale=7, nullable=False), DecimalGen(precision=12, scale=2, nullable=False)] @pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) def test_single_orderby(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order), conf = allow_negative_scale_of_decimal_conf) @pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) def test_single_sort_in_part(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).sortWithinPartitions(order), conf = allow_negative_scale_of_decimal_conf) orderable_gens_sort = [byte_gen, short_gen, int_gen, long_gen, pytest.param(float_gen, marks=pytest.mark.xfail(condition=is_before_spark_310(), reason='Spark has -0.0 < 0.0 before Spark 3.1')), pytest.param(double_gen, marks=pytest.mark.xfail(condition=is_before_spark_310(), reason='Spark has -0.0 < 0.0 before Spark 3.1')), boolean_gen, timestamp_gen, date_gen, string_gen, null_gen] + decimal_gens @pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn) def test_multi_orderby(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc()), conf = allow_negative_scale_of_decimal_conf)
all_gen_restricting_dates = [StringGen(), ByteGen(), ShortGen(), IntegerGen(), LongGen(),
           # The float and double generators carry the incompat mark.
           pytest.param(FloatGen(special_cases=[FLOAT_MIN, FLOAT_MAX, 0.0, 1.0, -1.0]), marks=[incompat]),
           pytest.param(DoubleGen(special_cases=double_special_cases), marks=[incompat]),
           BooleanGen(),
           # due to backward compatibility we are avoiding writing dates prior to 1582-10-15
           # For more detail please look at SPARK-31404
           # This issue is tracked by https://github.com/NVIDIA/spark-rapids/issues/133 in the plugin
           DateGen(start=date(1582, 10, 15)),
           TimestampGen()]

@pytest.mark.parametrize('ts_rebase', ['CORRECTED', 'LEGACY'])
@pytest.mark.parametrize('data_gen', all_gen_restricting_dates, ids=idfn)
@pytest.mark.parametrize('ts_write', ['INT96', 'TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS'])
@pytest.mark.parametrize('enableVectorized', ['true', 'false'], ids=idfn)
# Marked xfail whenever is_before_spark_310() is false; see the linked issue.
@pytest.mark.xfail(condition=not(is_before_spark_310()), reason='https://github.com/NVIDIA/spark-rapids/issues/953')
@allow_non_gpu('DataWritingCommandExec')
def test_cache_posexplode_makearray(spark_tmp_path, data_gen, ts_rebase, ts_write, enableVectorized):
    # NOTE(review): this function continues past the end of the visible
    # source, so only the part shown here is annotated.
    # Boolean data is xfailed at run time when is_spark_300() is true.
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    # Separate output directories under spark_tmp_path for the two sessions.
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'
    def write_posExplode(data_path):
        # Returns a closure that caches the posexplode query, calls count()
        # on it, writes it as parquet to data_path, then reads that path.
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a').cache()
            cached.count()
            cached.write.parquet(data_path)
            # NOTE(review): the read result is discarded and posExplode has
            # no return statement, so it returns None. from_cpu below is
            # presumably whatever with_cpu_session passes back -- confirm
            # that the later comparison actually checks the written data.
            spark.read.parquet(data_path)
        return posExplode
    from_cpu = with_cpu_session(write_posExplode(data_path_cpu),
                 conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,