def test_unary_positive(data_gen):
    """Unary '+' on a column should produce identical results on GPU and CPU."""
    def apply_unary_plus(spark):
        return unary_op_df(spark, data_gen).selectExpr('+a')
    assert_gpu_and_cpu_are_equal_collect(apply_unary_plus)
def test_decimal_round(data_gen):
    """round() at several scales (including negative and beyond-precision) matches CPU."""
    round_exprs = ['round(a)', 'round(a, -1)', 'round(a, 1)', 'round(a, 10)']
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(*round_exprs),
        conf=allow_negative_scale_of_decimal_conf)
def test_degrees_small(data_gen):
    """degrees(a) over small inputs should agree between GPU and CPU."""
    def apply_degrees(spark):
        return unary_op_df(spark, data_gen).selectExpr('degrees(a)')
    assert_gpu_and_cpu_are_equal_collect(apply_degrees)
def test_coalesce_df(num_parts, length):
    """DataFrame.coalesce over a mixed-type frame should match CPU results."""
    # This should change eventually to be more than just the basic gens
    column_gens = [('_c' + str(idx), gen)
                   for idx, gen in enumerate(all_basic_gens + decimal_gens)]
    def build_and_coalesce(spark):
        return gen_df(spark, column_gens, length=length).coalesce(num_parts)
    assert_gpu_and_cpu_are_equal_collect(build_and_coalesce)
def test_initcap_special_chars():
    """initcap over strings with special/accented characters matches CPU."""
    special_gen = mk_str_gen('ʼn([aAbB13ȺéŸ]{0,5}){1,5}')
    def apply_initcap(spark):
        return unary_op_df(spark, special_gen).select(f.initcap(f.col('a')))
    assert_gpu_and_cpu_are_equal_collect(apply_initcap)
def test_union_by_missing_col_name(data_gen):
    """unionByName with allowMissingColumns=True on frames with disjoint column names."""
    def union_missing(spark):
        left = binary_op_df(spark, data_gen).withColumnRenamed("a", "x")
        right = binary_op_df(spark, data_gen).withColumnRenamed("a", "y")
        return left.unionByName(right, True)
    assert_gpu_and_cpu_are_equal_collect(union_missing)
def test_union_by_name(data_gen):
    """Plain unionByName of two frames with identical schemas matches CPU."""
    def union_same_schema(spark):
        left = binary_op_df(spark, data_gen)
        right = binary_op_df(spark, data_gen)
        return left.unionByName(right)
    assert_gpu_and_cpu_are_equal_collect(union_same_schema)
def test_single_orderby_with_limit(data_gen, order):
    """orderBy on one column followed by limit(100) matches CPU ordering."""
    def ordered_limit(spark):
        return unary_op_df(spark, data_gen).orderBy(order).limit(100)
    assert_gpu_and_cpu_are_equal_collect(ordered_limit)
def test_single_nested_orderby_with_limit(data_gen, order):
    """orderBy + limit on nested data, with CPU range partitioning disallowed."""
    def ordered_limit(spark):
        return unary_op_df(spark, data_gen).orderBy(order).limit(100)
    assert_gpu_and_cpu_are_equal_collect(
        ordered_limit,
        conf={'spark.rapids.allowCpuRangePartitioning': False})
def test_array_cast_fallback():
    """Cast an array<int> column to array<string> and compare GPU vs CPU results.

    NOTE(review): the original inner helper was named ``cast_float_to_double``,
    which did not match what it does — it casts ArrayType(int) to
    ArrayType(StringType). Renamed the local helper to describe the actual cast;
    test behavior and the public test name are unchanged.
    """
    def cast_int_array_to_string_array(spark):
        df = two_col_df(spark, int_gen, ArrayGen(int_gen))
        return df.select(df.b.cast(ArrayType(StringType())))
    assert_gpu_and_cpu_are_equal_collect(cast_int_array_to_string_array)
def test_array_cast_bad_from_good_to_fallback(child_gen, child_to_type):
    """Cast an array column's element type to child_to_type; GPU must match CPU."""
    def cast_array_column(spark):
        df = two_col_df(spark, int_gen, ArrayGen(child_gen))
        return df.select(df.b.cast(ArrayType(child_to_type)))
    assert_gpu_and_cpu_are_equal_collect(cast_array_column)
def test_array_element_at_all_null_ansi_not_fail(data_gen):
    """element_at on all-null arrays must not fail under ANSI mode."""
    ansi_conf = {'spark.sql.ansi.enabled': True,
                 'spark.sql.legacy.allowNegativeScaleOfDecimal': True}
    def pick_element(spark):
        return unary_op_df(spark, data_gen).select(element_at(col('a'), 100))
    assert_gpu_and_cpu_are_equal_collect(pick_element, conf=ansi_conf)
def test_columnar_pow(data_gen):
    """pow(a, b) over two generated columns should agree between GPU and CPU."""
    def apply_pow(spark):
        return binary_op_df(spark, data_gen).selectExpr('pow(a, b)')
    assert_gpu_and_cpu_are_equal_collect(apply_pow)
def test_columnar_asinh_improved(data_gen):
    """asinh(a) with the improved float-ops path enabled matches CPU."""
    improved_float_conf = {'spark.rapids.sql.improvedFloatOps.enabled': 'true'}
    def apply_asinh(spark):
        return unary_op_df(spark, data_gen).selectExpr('asinh(a)')
    assert_gpu_and_cpu_are_equal_collect(apply_asinh, improved_float_conf)
def test_passing_gpuExpr_as_Expr(enableVectorizedConf):
    """Multi-stage pipeline (drop nulls, group, count, order, cache, limit)
    built from Column expressions should match CPU under the given conf."""
    def build_pipeline(spark):
        cleaned = unary_op_df(spark, string_gen).select(f.col("a")).na.drop()
        counted = cleaned.groupBy(f.col("a")).agg(
            f.count(f.col("a")).alias("count_a"))
        ordered = counted.orderBy(f.col("count_a").desc(), f.col("a"))
        return ordered.cache().limit(50)
    assert_gpu_and_cpu_are_equal_collect(build_pipeline, enableVectorizedConf)
def test_single_sort_in_part(data_gen, order):
    """sortWithinPartitions on one column matches CPU per-partition ordering."""
    def partition_sort(spark):
        return unary_op_df(spark, data_gen).sortWithinPartitions(order)
    assert_gpu_and_cpu_are_equal_collect(
        partition_sort,
        conf=allow_negative_scale_of_decimal_conf)
def test_union_struct_missing_children(data_gen):
    """unionByName(allowMissingColumns=True) over structs with differing children."""
    left_gen, right_gen = data_gen
    def union_structs(spark):
        left = binary_op_df(spark, left_gen)
        right = binary_op_df(spark, right_gen)
        return left.unionByName(right, True)
    assert_gpu_and_cpu_are_equal_collect(union_structs)
def test_multi_orderby(data_gen):
    """orderBy on two columns (second descending) matches CPU ordering."""
    def two_key_sort(spark):
        return binary_op_df(spark, data_gen).orderBy(
            f.col('a'), f.col('b').desc())
    assert_gpu_and_cpu_are_equal_collect(
        two_key_sort,
        conf=allow_negative_scale_of_decimal_conf)
def assert_union_equal(gen1, gen2):
    """Helper: union two single-column frames by name (allowing missing columns)
    and check GPU/CPU equality."""
    def union_frames(spark):
        left = unary_op_df(spark, gen1)
        right = unary_op_df(spark, gen2)
        return left.unionByName(right, True)
    assert_gpu_and_cpu_are_equal_collect(union_frames)
def test_multi_orderby_with_limit(data_gen):
    """Two-key orderBy followed by limit(100) matches CPU."""
    def sorted_head(spark):
        df = binary_op_df(spark, data_gen)
        return df.orderBy(f.col('a'), f.col('b').desc()).limit(100)
    assert_gpu_and_cpu_are_equal_collect(sorted_head)
def test_coalesce_types(data_gen):
    """coalesce(2) over a generated frame of the given types matches CPU."""
    def coalesce_two(spark):
        return gen_df(spark, data_gen).coalesce(2)
    assert_gpu_and_cpu_are_equal_collect(coalesce_two)
def test_orderby_with_processing_and_limit(data_gen):
    """orderBy on a computed key plus limit matches CPU."""
    def processed_sort(spark):
        # avoid ambiguity in the order by statement for floating point by including a as a backup ordering column
        return unary_op_df(spark, data_gen).orderBy(
            f.lit(100) - f.col('a'), f.col('a')).limit(100)
    assert_gpu_and_cpu_are_equal_collect(processed_sort)
def test_repeat_column_and_column():
    """repeat(a, b) with both the string and the count as columns matches CPU."""
    string_col_gen = StringGen(nullable=True)
    count_col_gen = IntegerGen(min_val=-100, max_val=100, special_cases=[0],
                               nullable=True)
    def apply_repeat(spark):
        return two_col_df(spark, string_col_gen, count_col_gen).selectExpr(
            'repeat(a, b)')
    assert_gpu_and_cpu_are_equal_collect(apply_repeat)
def test_large_orderby():
    """orderBy over many rows with a small batch size, forcing multiple batches."""
    small_batch_conf = {'spark.rapids.sql.batchSizeBytes': '16384'}
    def big_sort(spark):
        df = unary_op_df(spark, long_gen, length=1024 * 128)
        return df.orderBy(f.col('a'))
    assert_gpu_and_cpu_are_equal_collect(big_sort, conf=small_batch_conf)
def test_ceil(data_gen):
    """ceil(a) matches CPU, with negative decimal scales allowed."""
    def apply_ceil(spark):
        return unary_op_df(spark, data_gen).selectExpr('ceil(a)')
    assert_gpu_and_cpu_are_equal_collect(
        apply_ceil,
        conf=allow_negative_scale_of_decimal_conf)
def test_single_orderby(data_gen, order):
    """orderBy on a single column matches CPU ordering."""
    def single_sort(spark):
        return unary_op_df(spark, data_gen).orderBy(order)
    assert_gpu_and_cpu_are_equal_collect(
        single_sort,
        conf=allow_negative_scale_of_decimal_conf)
def test_bit_not(data_gen):
    """Bitwise NOT (~a) matches CPU."""
    def apply_bit_not(spark):
        return unary_op_df(spark, data_gen).selectExpr('~a')
    assert_gpu_and_cpu_are_equal_collect(apply_bit_not)
def test_cache_partial_load(data_gen, enableVectorizedConf):
    """Reading only part of a cached frame (limit then project) matches CPU."""
    def partial_read(spark):
        cached = two_col_df(spark, data_gen, string_gen).select(
            f.col("a"), f.col("b")).cache()
        return cached.limit(50).select(f.col("b"))
    assert_gpu_and_cpu_are_equal_collect(partial_read, enableVectorizedConf)
def test_cos(data_gen):
    """cos(a) matches CPU."""
    def apply_cos(spark):
        return unary_op_df(spark, data_gen).selectExpr('cos(a)')
    assert_gpu_and_cpu_are_equal_collect(apply_cos)
def test_struct_get_item(data_gen):
    """Selecting individual struct fields by dotted name matches CPU."""
    def select_fields(spark):
        return unary_op_df(spark, data_gen).selectExpr(
            'a.first', 'a.second', 'a.third')
    assert_gpu_and_cpu_are_equal_collect(select_fields)