Example #1
    _test_div_by_zero(ansi_mode='nonAnsi', expr=expr)


def _get_div_overflow_df(spark, expr):
    return spark.createDataFrame([(LONG_MIN, -1)], ['a', 'b']).selectExpr(expr)


div_overflow_exprs = [
    'CAST(-9223372036854775808L as LONG) DIV -1', 'a DIV CAST(-1 AS INT)',
    'a DIV b'
]


# Only run this test for Spark v3.2.0 and later to verify IntegralDivide will
# throw exceptions for overflow when ANSI mode is enabled.
@pytest.mark.skipif(is_before_spark_320(),
                    reason='https://github.com/apache/spark/pull/32260')
@pytest.mark.parametrize('expr', div_overflow_exprs)
@pytest.mark.parametrize('ansi_enabled', ['false', 'true'])
def test_div_overflow_exception_when_ansi(expr, ansi_enabled):
    ansi_conf = {'spark.sql.ansi.enabled': ansi_enabled}
    if ansi_enabled == 'true':
        assert_gpu_and_cpu_error(
            df_fun=lambda spark: _get_div_overflow_df(spark, expr).collect(),
            conf=ansi_conf,
            error_message=
            'java.lang.ArithmeticException: Overflow in integral divide')
    else:
        assert_gpu_and_cpu_are_equal_collect(
            func=lambda spark: _get_div_overflow_df(spark, expr),
            conf=ansi_conf)
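
# A minimal standalone sketch (not part of the test harness) of the behavior verified
# above: with spark.sql.ansi.enabled=true, Spark 3.2.0+ raises an ArithmeticException
# with "Overflow in integral divide" for LONG_MIN DIV -1 instead of silently wrapping.
# The local session below is an illustrative assumption.
from pyspark.sql import SparkSession

demo_spark = SparkSession.builder.getOrCreate()
demo_spark.conf.set('spark.sql.ansi.enabled', 'true')
try:
    demo_spark.sql('SELECT CAST(-9223372036854775808L as LONG) DIV -1').collect()
except Exception as e:
    # Expect a message mentioning "Overflow in integral divide" on Spark 3.2.0+
    print(e)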
Example #2
            lambda spark : gen_df(spark, gen_list).write.orc(second_data_path))
    third_data_path = spark_tmp_path + '/ORC_DATA/key=2/key2=22'
    with_cpu_session(
            lambda spark : gen_df(spark, gen_list).write.orc(third_data_path))
    data_path = spark_tmp_path + '/ORC_DATA'
    all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list})
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : spark.read.orc(data_path),
            conf=all_confs)

# Setup external table by altering column names
def setup_external_table_with_forced_positions(spark, table_name, data_path):
    rename_cols_query = "CREATE EXTERNAL TABLE `{}` (`col10` INT, `_c1` STRING, `col30` DOUBLE) STORED AS orc LOCATION '{}'".format(table_name, data_path)
    spark.sql(rename_cols_query).collect()

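# A minimal sketch, outside the test harness, of what the helper above sets up: the ORC
# files carry columns _c0/_c1/_c2 while the external table declares col10/_c1/col30.
# With orc.force.positional.evolution=true the reader matches table columns to file
# columns by position, so the renamed columns still resolve; with false, name-based
# matching leaves them NULL. The Hive-enabled session, path and table name below are
# illustrative assumptions.
from pyspark.sql import SparkSession

demo_spark = SparkSession.builder.enableHiveSupport().getOrCreate()
demo_path = '/tmp/orc_forced_position_demo'
demo_spark.range(10).selectExpr(
    'CAST(id AS INT) AS _c0',
    'CAST(id AS STRING) AS _c1',
    'CAST(id AS DOUBLE) AS _c2').write.mode('overwrite').orc(demo_path)
setup_external_table_with_forced_positions(demo_spark, 'forced_pos_demo', demo_path)
demo_spark.conf.set('orc.force.positional.evolution', 'true')
demo_spark.sql('SELECT col10, _c1, col30 FROM forced_pos_demo').show()
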
@pytest.mark.skipif(is_before_spark_320(), reason='ORC forced positional evolution support was added in Spark 3.2')
@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
@pytest.mark.parametrize('forced_position', ["true", "false"])
@pytest.mark.parametrize('orc_impl', ["native", "hive"])
def test_orc_forced_position(spark_tmp_path, spark_tmp_table_factory, reader_confs, forced_position, orc_impl):
    orc_gens = [int_gen, string_gen, double_gen]
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    with_cpu_session(lambda spark : gen_df(spark, gen_list).write.orc(data_path))
    table_name = spark_tmp_table_factory.get()
    with_cpu_session(lambda spark : setup_external_table_with_forced_positions(spark, table_name, data_path))

    all_confs = copy_and_update(reader_confs, {
        'orc.force.positional.evolution': forced_position,
        'spark.sql.orc.impl': orc_impl})
    assert_gpu_and_cpu_are_equal_collect(
Example #3
                'a like "\\%SystemDrive\\%\\\\\\\\Users%"',
                'a like "_oo"'),
            conf={'spark.sql.parser.escapedStringLiterals': 'true'})
 
def test_regexp_replace():
    gen = mk_str_gen('[abcd]{0,3}')
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark: unary_op_df(spark, gen).selectExpr(
                'regexp_replace(a, "a", "A")',
                'regexp_replace(a, "[^xyz]", "A")',
                'regexp_replace(a, "([^x])|([^y])", "A")',
                'regexp_replace(a, "(?:aa)+", "A")',
                'regexp_replace(a, "a|b|c", "A")'),
        conf=_regexp_conf)

@pytest.mark.skipif(is_before_spark_320(), reason='regexp is synonym for RLike starting in Spark 3.2.0')
def test_regexp():
    gen = mk_str_gen('[abcd]{1,3}')
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark: unary_op_df(spark, gen).selectExpr(
                'regexp(a, "a{2}")',
                'regexp(a, "a{1,3}")',
                'regexp(a, "a{1,}")',
                'regexp(a, "a[bc]d")'),
        conf=_regexp_conf)
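
# A small hedged sketch (plain PySpark, no test harness) of the synonym relationship the
# tests here rely on: starting in Spark 3.2.0, regexp() and regexp_like() are synonyms
# for the RLIKE operator, so all three predicates below should return the same values.
# The session and tiny data set are illustrative assumptions.
from pyspark.sql import SparkSession

demo_spark = SparkSession.builder.getOrCreate()
demo_spark.createDataFrame([('aa',), ('abd',), ('xyz',)], ['a']).selectExpr(
    'a',
    'a RLIKE "a{2}"',
    'regexp(a, "a{2}")',       # Spark 3.2.0+ only
    'regexp_like(a, "a{2}")'   # Spark 3.2.0+ only
).show()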

@pytest.mark.skipif(is_before_spark_320(), reason='regexp_like is synonym for RLike starting in Spark 3.2.0')
def test_regexp_like():
    gen = mk_str_gen('[abcd]{1,3}')
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark: unary_op_df(spark, gen).selectExpr(
Example #4
_grouping_set_gen = [('a', StringGen()), ('b', StringGen())]

_grouping_set_sqls = [
    'SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(ROLLUP(a, b))',
    'SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b), (a), ()))',
    'SELECT a, b, count(1) FROM testData '
    'GROUP BY a, GROUPING SETS((a, b), GROUPING SETS(ROLLUP(a, b)))',
    'SELECT a, b, count(1) FROM testData '
    'GROUP BY a, GROUPING SETS((a, b, a, b), (a, b, a), (a, b))',
    'SELECT a, b, count(1) FROM testData GROUP BY a, '
    'GROUPING SETS(GROUPING SETS((a, b, a, b), (a, b, a), (a, b)))',
    'SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(ROLLUP(a, b), CUBE(a, b))',
    'SELECT a, b, count(1) FROM testData '
    'GROUP BY a, GROUPING SETS(GROUPING SETS((a, b), (a), ()), '
    'GROUPING SETS((a, b), (a), (b), ()))',
    'SELECT a, b, count(1) FROM testData '
    'GROUP BY a, GROUPING SETS((a, b), (a), (), (a, b), (a), (b), ())',
]


# Test the nested syntax of GROUPING SETS, ROLLUP and CUBE
@ignore_order
@pytest.mark.parametrize('data_gen', [_grouping_set_gen], ids=idfn)
@pytest.mark.parametrize('sql', _grouping_set_sqls, ids=idfn)
@pytest.mark.skipif(
    is_before_spark_320(),
    reason='Nested grouping sets are not supported before Spark 3.2.0')
def test_nested_grouping_sets_rollup_cube(data_gen, sql):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=2048), "testData", sql)
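
# A minimal sketch, independent of the test harness, of running one of the nested
# GROUPING SETS queries above directly. On Spark 3.2.0+ the nested syntax is accepted;
# the session and the tiny in-memory testData view are illustrative assumptions.
from pyspark.sql import SparkSession

demo_spark = SparkSession.builder.getOrCreate()
demo_spark.createDataFrame([('x', 'p'), ('x', 'q'), ('y', 'p')], ['a', 'b']) \
    .createOrReplaceTempView('testData')
demo_spark.sql(_grouping_set_sqls[0]).show()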
Example #5
    # In Spark 3.2.0+ the set of valid formats changed, and we cannot support all of them,
    # so this provides values that are valid in all of those formats.
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : unary_op_df(spark, StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}')).select(f.col('a').cast(DateType())),
            conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'})

def test_cast_string_ts_valid_format():
    # In Spark 3.2.0+ the set of valid formats changed, and we cannot support all of them,
    # so this provides values that are valid in all of those formats.
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : unary_op_df(spark, StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}')).select(f.col('a').cast(TimestampType())),
            conf = {'spark.rapids.sql.hasExtendedYearValues': 'false',
                'spark.rapids.sql.castStringToTimestamp.enabled': 'true'})
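
# A hedged sketch of the constraint the two tests above encode: only yyyy-[m]m-[d]d style
# strings are used because they cast cleanly on every supported Spark version, and the
# tests pass spark.rapids.sql.hasExtendedYearValues=false to indicate extended-year
# values are not expected. The plain-Spark snippet below only shows the CPU cast
# behavior; the session and values are illustrative assumptions.
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.types import DateType, TimestampType

demo_spark = SparkSession.builder.getOrCreate()
demo_spark.createDataFrame([('2021-3-7',), ('1999-12-31',)], ['a']).select(
    f.col('a').cast(DateType()),
    f.col('a').cast(TimestampType())).show()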

@allow_non_gpu('ProjectExec', 'Cast', 'Alias')
@pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ do we have issues with extended years")
def test_cast_string_date_fallback():
    assert_gpu_fallback_collect(
            # Cast back to String because the generated years can exceed what Python's datetime can represent
            lambda spark : unary_op_df(spark, StringGen('([0-9]|-|\\+){4,12}')).select(f.col('a').cast(DateType()).cast(StringType())),
            'Cast')

@allow_non_gpu('ProjectExec', 'Cast', 'Alias')
@pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ do we have issues with extended years")
def test_cast_string_timestamp_fallback():
    assert_gpu_fallback_collect(
            # Cast back to String because the generated years can exceed what Python's datetime can represent
            lambda spark : unary_op_df(spark, StringGen('([0-9]|-|\\+){4,12}')).select(f.col('a').cast(TimestampType()).cast(StringType())),
            'Cast',
            conf = {'spark.rapids.sql.castStringToTimestamp.enabled': 'true'})
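
# A hedged sketch of why both fallback tests above cast the result back to StringType:
# the generator '([0-9]|-|\+){4,12}' can produce extended-year strings (for example
# '+123456-01-01', an illustrative assumption) that Spark 3.2.0+ may parse into dates
# whose years Python's datetime cannot represent, so collecting the raw DATE/TIMESTAMP
# values in the Python process could fail. Casting back to String keeps collect() safe.
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.types import DateType, StringType

demo_spark = SparkSession.builder.getOrCreate()
demo_spark.createDataFrame([('+123456-01-01',)], ['a']).select(
    f.col('a').cast(DateType()).cast(StringType())).show(truncate=False)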