Example 1
all_gen = [
    ByteGen(),
    ShortGen(),
    IntegerGen(),
    LongGen(),
    FloatGen(),
    DoubleGen(),
    BooleanGen(),
    DateGen(),
    TimestampGen()
]


#sort locally because of https://github.com/NVIDIA/spark-rapids/issues/84
@ignore_order(local=True)
@pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
@pytest.mark.xfail(condition=not (is_before_spark_310()),
                   reason='https://github.com/NVIDIA/spark-rapids/issues/953')
def test_posexplode_makearray(data_gen):
    assert_gpu_and_cpu_are_equal_collect(lambda spark: four_op_df(
        spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a'))


#sort locally because of https://github.com/NVIDIA/spark-rapids/issues/84
@ignore_order(local=True)
@pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
def test_posexplode_litarray(data_gen):
    array_lit = gen_scalar(
        ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: four_op_df(spark, data_gen).select(
            f.col('a'), f.col('b'), f.col('c'), f.posexplode(array_lit)))
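
For context, posexplode turns each array element into its own row with a 0-based pos column alongside the element value. A minimal standalone sketch of the expression the tests above exercise, assuming a plain local SparkSession outside the test harness:

# Standalone sketch of posexplode semantics (not part of the test
# harness above); assumes a plain local SparkSession.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()

df = spark.createDataFrame([(1, 10, 20, 30)], ['a', 'b', 'c', 'd'])

# posexplode(array(b, c, d)) yields one row per element, with columns
# `pos` (0-based position) and `col` (the element value), plus `a`.
df.selectExpr('posexplode(array(b, c, d))', 'a').show()
# +---+---+---+
# |pos|col|  a|
# +---+---+---+
# |  0| 10|  1|
# |  1| 20|  1|
# |  2| 30|  1|
# +---+---+---+
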
Example 2
_mixed_df1_with_nulls = [('a',
                          RepeatSeqGen(LongGen(nullable=(True, 20.0)),
                                       length=10)), ('b', IntegerGen()),
                         ('c', LongGen())]
_mixed_df2_with_nulls = [('a',
                          RepeatSeqGen(LongGen(nullable=(True, 20.0)),
                                       length=10)), ('b', StringGen()),
                         ('c', BooleanGen())]


@ignore_order
@pytest.mark.parametrize('join_type', [
    'Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti',
    pytest.param(
        'FullOuter',
        marks=pytest.mark.xfail(
            condition=not (is_before_spark_310()),
            reason='https://github.com/NVIDIA/spark-rapids/issues/575')),
    'Cross'
],
                         ids=idfn)
def test_broadcast_join_mixed(join_type):
    def do_join(spark):
        left = gen_df(spark, _mixed_df1_with_nulls, length=500)
        right = gen_df(spark, _mixed_df2_with_nulls, length=500).withColumnRenamed("a", "r_a")\
                .withColumnRenamed("b", "r_b").withColumnRenamed("c", "r_c")
        return left.join(broadcast(right), left.a.eqNullSafe(right.r_a),
                         join_type)

    assert_gpu_and_cpu_are_equal_collect(do_join)
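
The join condition above uses eqNullSafe rather than == so that NULL keys compare equal to each other. A minimal standalone sketch of the difference, assuming a plain local SparkSession:

# Standalone sketch of why the join above uses eqNullSafe: unlike `==`,
# it treats two NULL keys as equal. Assumes a plain local SparkSession.
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

spark = SparkSession.builder.master('local[1]').getOrCreate()

left = spark.createDataFrame([(1,), (None,)], ['a'])
right = spark.createDataFrame([(1,), (None,)], ['r_a'])

# `==` drops the NULL/NULL pair; eqNullSafe keeps it, so the second
# broadcast inner join returns two rows instead of one.
left.join(broadcast(right), left.a == right.r_a, 'Inner').show()
left.join(broadcast(right), left.a.eqNullSafe(right.r_a), 'Inner').show()
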

Example 3
@pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn)
@pytest.mark.parametrize('order', [
    f.col('a').asc(),
    f.col('a').asc_nulls_last(),
    f.col('a').desc(),
    f.col('a').desc_nulls_first()
],
                         ids=idfn)
def test_single_sort_in_part(data_gen, order):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).sortWithinPartitions(order),
        conf=allow_negative_scale_of_decimal_conf)


orderable_gens_sort = [
    byte_gen, short_gen, int_gen, long_gen,
    pytest.param(float_gen,
                 marks=pytest.mark.xfail(
                     condition=is_before_spark_310(),
                     reason='Spark has -0.0 < 0.0 before Spark 3.1')),
    pytest.param(double_gen,
                 marks=pytest.mark.xfail(
                     condition=is_before_spark_310(),
                     reason='Spark has -0.0 < 0.0 before Spark 3.1')),
    boolean_gen, timestamp_gen, date_gen, string_gen, null_gen
] + decimal_gens


@pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn)
def test_multi_orderby(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, data_gen).orderBy(
            f.col('a'),
            f.col('b').desc()),
        conf=allow_negative_scale_of_decimal_conf)
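
The two tests above exercise Spark's two sort operators: orderBy produces a total ordering across the whole DataFrame, while sortWithinPartitions only orders rows inside each partition. A standalone sketch, assuming a plain local SparkSession:

# Standalone sketch of orderBy vs sortWithinPartitions, assuming a
# plain local SparkSession with two partitions.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.master('local[2]').getOrCreate()

df = spark.range(8).withColumn('a', f.col('id') % 3).repartition(2)

df.orderBy(f.col('a').desc()).show()               # globally sorted
df.sortWithinPartitions(f.col('a').desc()).show()  # sorted per partition only
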
Example 4
@pytest.mark.parametrize('data_gen', non_nan_all_basic_gens, ids=idfn)
def test_generic_reductions(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : unary_op_df(spark, data_gen)\
                    .coalesce(1).selectExpr(
                'min(a)',
                'max(a)',
                'first(a)',
                'last(a)',
                'count(a)',
                'count(1)'),
            conf = _no_nans_float_conf)

@pytest.mark.parametrize('data_gen', non_nan_all_basic_gens, ids=idfn)
def test_distinct_count_reductions(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : binary_op_df(spark, data_gen).selectExpr(
                'count(DISTINCT a)'))

@pytest.mark.xfail(condition=is_before_spark_310(),
        reason='Spark fixed distinct count of NaNs in 3.1')
@pytest.mark.parametrize('data_gen', [float_gen, double_gen], ids=idfn)
def test_distinct_float_count_reductions(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : binary_op_df(spark, data_gen).selectExpr(
                'count(DISTINCT a)'))

@approximate_float
@pytest.mark.parametrize('data_gen', numeric_gens, ids=idfn)
def test_arithmetic_reductions(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : unary_op_df(spark, data_gen).selectExpr(
                'sum(a)',
                'avg(a)'),
            conf = _no_nans_float_conf)
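
All of these reductions are plain Spark SQL aggregates; only the CPU/GPU comparison harness is plugin-specific. A standalone sketch of the expressions being compared, assuming a plain local SparkSession (before Spark 3.1, count(DISTINCT a) could count each NaN separately, which is the behavior the xfail above pins down):

# Standalone sketch of the aggregate expressions exercised above,
# assuming a plain local SparkSession.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()

df = spark.createDataFrame(
    [(1.0,), (1.0,), (float('nan'),), (float('nan'),)], ['a'])

# Spark 3.1 normalizes NaNs before distinct aggregation, so the two NaN
# rows count as one distinct value; older releases could report two.
df.selectExpr('count(DISTINCT a)', 'sum(a)', 'avg(a)',
              'min(a)', 'max(a)', 'count(a)', 'count(1)').show()
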
Example 5
orderable_not_null_gen = [ByteGen(nullable=False), ShortGen(nullable=False), IntegerGen(nullable=False),
        LongGen(nullable=False), FloatGen(nullable=False), DoubleGen(nullable=False), BooleanGen(nullable=False),
        TimestampGen(nullable=False), DateGen(nullable=False), StringGen(nullable=False), DecimalGen(nullable=False),
        DecimalGen(precision=7, scale=-3, nullable=False), DecimalGen(precision=7, scale=3, nullable=False),
        DecimalGen(precision=7, scale=7, nullable=False), DecimalGen(precision=12, scale=2, nullable=False)]

@pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn)
@pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn)
def test_single_orderby(data_gen, order):
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : unary_op_df(spark, data_gen).orderBy(order),
            conf = allow_negative_scale_of_decimal_conf)

@pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn)
@pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn)
def test_single_sort_in_part(data_gen, order):
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : unary_op_df(spark, data_gen).sortWithinPartitions(order),
            conf = allow_negative_scale_of_decimal_conf)

orderable_gens_sort = [byte_gen, short_gen, int_gen, long_gen,
        pytest.param(float_gen, marks=pytest.mark.xfail(condition=is_before_spark_310(),
            reason='Spark has -0.0 < 0.0 before Spark 3.1')),
        pytest.param(double_gen, marks=pytest.mark.xfail(condition=is_before_spark_310(),
            reason='Spark has -0.0 < 0.0 before Spark 3.1')),
        boolean_gen, timestamp_gen, date_gen, string_gen, null_gen] + decimal_gens
@pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn)
def test_multi_orderby(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc()),
            conf = allow_negative_scale_of_decimal_conf)
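
The four order variants parametrized above cover Spark's null-ordering rules: ascending sorts place NULLs first by default and descending sorts place them last, with asc_nulls_last and desc_nulls_first overriding the defaults. A standalone sketch, assuming a plain local SparkSession:

# Standalone sketch of the four null orderings parametrized above,
# assuming a plain local SparkSession.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.master('local[1]').getOrCreate()

df = spark.createDataFrame([(2,), (None,), (1,)], ['a'])

df.orderBy(f.col('a').asc()).show()               # NULLs first (asc default)
df.orderBy(f.col('a').asc_nulls_last()).show()    # NULLs last
df.orderBy(f.col('a').desc()).show()              # NULLs last (desc default)
df.orderBy(f.col('a').desc_nulls_first()).show()  # NULLs first
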
Example 6
all_gen_restricting_dates = [StringGen(), ByteGen(), ShortGen(), IntegerGen(), LongGen(),
           pytest.param(FloatGen(special_cases=[FLOAT_MIN, FLOAT_MAX, 0.0, 1.0, -1.0]), marks=[incompat]),
           pytest.param(DoubleGen(special_cases=double_special_cases), marks=[incompat]),
           BooleanGen(),
           # due to backward compatibility we are avoiding writing dates prior to 1582-10-15
           # For more detail please look at SPARK-31404
           # This issue is tracked by https://github.com/NVIDIA/spark-rapids/issues/133 in the plugin
           DateGen(start=date(1582, 10, 15)),
           TimestampGen()]
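
The comment above restricts generated dates to 1582-10-15 and later because Parquet writes of earlier dates depend on Spark 3.0's calendar-rebase setting, which the test below parametrizes as ts_rebase. A standalone sketch of that conf in plain Spark, assuming a local SparkSession and a scratch path (/tmp/rebase_demo is just an illustrative location):

# Standalone sketch of the Parquet date/timestamp rebase conf the test
# below parametrizes ('CORRECTED' vs 'LEGACY'); assumes a plain local
# SparkSession and a scratch output path.
from datetime import date
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()

# 'CORRECTED' writes dates as-is in the Proleptic Gregorian calendar;
# 'LEGACY' rebases pre-1582-10-15 values to the hybrid Julian calendar
# for compatibility with Spark 2.x readers (see SPARK-31404).
spark.conf.set('spark.sql.legacy.parquet.datetimeRebaseModeInWrite', 'CORRECTED')
spark.createDataFrame([(date(1582, 10, 15),)], ['d']) \
    .write.mode('overwrite').parquet('/tmp/rebase_demo')
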

@pytest.mark.parametrize('ts_rebase', ['CORRECTED', 'LEGACY'])
@pytest.mark.parametrize('data_gen', all_gen_restricting_dates, ids=idfn)
@pytest.mark.parametrize('ts_write', ['INT96', 'TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS'])
@pytest.mark.parametrize('enableVectorized', ['true', 'false'], ids=idfn)
@pytest.mark.xfail(condition=not(is_before_spark_310()), reason='https://github.com/NVIDIA/spark-rapids/issues/953')
@allow_non_gpu('DataWritingCommandExec')
def test_cache_posexplode_makearray(spark_tmp_path, data_gen, ts_rebase, ts_write, enableVectorized):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'
    def write_posExplode(data_path):
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a').cache()
            cached.count()
            cached.write.parquet(data_path)
            spark.read.parquet(data_path)
        return posExplode
    from_cpu = with_cpu_session(write_posExplode(data_path_cpu),
                  conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,