Example #1
def test_hive_simple_udf_native(enable_rapids_udf_example_native):
    with_spark_session(skip_if_no_hive)
    # Assumed input: a single string column, mirroring Example #2 below.
    data_gens = [["s", StringGen('.{0,30}')]]
    def evalfn(spark):
        load_hive_udf_or_skip_test(spark, "wordcount", "com.nvidia.spark.rapids.udf.hive.StringWordCount")
        return gen_df(spark, data_gens)
    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_native_udf_test_table",
        "SELECT wordcount(s) FROM hive_native_udf_test_table")
Example #2
def test_hive_generic_udf():
    with_spark_session(skip_if_no_hive)
    data_gens = [["s", StringGen('.{0,30}')]]
    def evalfn(spark):
        load_hive_udf_or_skip_test(spark, "urlencode", "com.nvidia.spark.rapids.udf.hive.URLEncode")
        return gen_df(spark, data_gens)
    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_generic_udf_test_table",
        "SELECT urlencode(s) FROM hive_generic_udf_test_table")
Example #3
def test_hive_simple_udf():
    with_spark_session(skip_if_no_hive)
    data_gens = [["i", int_gen], ["s", encoded_url_gen]]
    def evalfn(spark):
        load_hive_udf_or_skip_test(spark, "urldecode", "com.nvidia.spark.rapids.udf.hive.URLDecode")
        return gen_df(spark, data_gens)
    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_simple_udf_test_table",
        "SELECT i, urldecode(s) FROM hive_simple_udf_test_table")
Example #4
def test_hive_simple_udf():
    with_spark_session(skip_if_no_hive)
    data_gens = [["i", int_gen],
                 ["s",
                  StringGen('([^%]{0,1}(%[0-9A-F][0-9A-F]){0,1}){0,30}')]]

    def evalfn(spark):
        load_udf_or_skip_test(spark, "urldecode",
                              "com.nvidia.spark.rapids.udf.URLDecode")
        return gen_df(spark, data_gens)

    assert_gpu_and_cpu_are_equal_sql(
        evalfn, "hive_simple_udf_test_table",
        "SELECT i, urldecode(s) FROM hive_simple_udf_test_table")
Example #5
def test_hive_empty_generic_udf():
    with_spark_session(skip_if_no_hive)

    def evalfn(spark):
        load_hive_udf(
            spark, "emptygeneric",
            "com.nvidia.spark.rapids.tests.udf.hive.EmptyHiveGenericUDF")
        return gen_df(spark, [["s", string_gen]])

    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_generic_udf_test_table",
        "SELECT emptygeneric(s) FROM hive_generic_udf_test_table",
        conf={'spark.rapids.sql.rowBasedUDF.enabled': 'true'})
Example #6
def test_hive_empty_simple_udf():
    with_spark_session(skip_if_no_hive)
    data_gens = [["i", int_gen], ["s", string_gen]]

    def evalfn(spark):
        load_hive_udf(
            spark, "emptysimple",
            "com.nvidia.spark.rapids.tests.udf.hive.EmptyHiveSimpleUDF")
        return gen_df(spark, data_gens)

    assert_gpu_and_cpu_are_equal_sql(
        evalfn,
        "hive_simple_udf_test_table",
        "SELECT i, emptysimple(s, 'const_string') FROM hive_simple_udf_test_table",
        conf={'spark.rapids.sql.rowBasedUDF.enabled': 'true'})
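Unlike the earlier examples, Examples #5 and #6 pass conf={'spark.rapids.sql.rowBasedUDF.enabled': 'true'}. This RAPIDS Accelerator setting enables the plugin's row-based UDF path, so a UDF with no GPU implementation can still be evaluated inside a GPU-accelerated plan rather than forcing the whole query to fall back to the CPU.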
Example #7
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark : gen_df(spark, data_gen, length=1024),
        "hash_agg_table",
        'select a, '
        'count(*) as count_stars, ' 
        'count(b) as count_bees, '
        'sum(b) as sum_of_bees, '
        'max(c) as max_seas, '
        'min(c) as min_seas, '
        'count(distinct c) as count_distinct_cees, '
        'avg(c) as average_seas '
        'from hash_agg_table group by a',
        _no_nans_float_conf)


@pytest.mark.xfail(
    condition=with_spark_session(lambda spark : is_spark_300()),
    reason="[SPARK-32038][SQL] NormalizeFloatingNumbers should also work on distinct aggregate "
           "(https://github.com/apache/spark/pull/28876) "
           "Fixed in later Apache Spark releases.")
@approximate_float
@ignore_order
@pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
def test_count_distinct_with_nan_floats(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark : gen_df(spark, data_gen, length=1024),
        "hash_agg_table",
        'select a, count(distinct b) as count_distinct_bees from hash_agg_table group by a',
        _no_nans_float_conf)

# TODO: Literal tests
# TODO: First and Last tests
Example #8
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.sql(
            'select a, '
                   'count(*) as count_stars, ' 
                   'count(b) as count_bees, '
                   'sum(b) as sum_of_bees, '
                   'max(c) as max_seas, '
                   'min(c) as min_seas, '
                   'count(distinct c) as count_distinct_cees, '
                   'avg(c) as average_seas '
            'from hash_agg_table group by a'),
        conf=_no_nans_float_conf)


@pytest.mark.xfail(
    condition=with_spark_session(lambda spark : spark.sparkContext.version == "3.0.0"),
    reason="[SPARK-32038][SQL] NormalizeFloatingNumbers should also work on distinct aggregate "
           "(https://github.com/apache/spark/pull/28876) "
           "Fixed in later Apache Spark releases.")
@approximate_float
@ignore_order
@pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
def test_count_distinct_with_nan_floats(data_gen):
    df = with_cpu_session(
        lambda spark : gen_df(spark, data_gen, length=1024))
    df.createOrReplaceTempView("hash_agg_table")
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.sql(
            'select a, '
                   'count(distinct b) as count_distinct_bees '
            'from hash_agg_table group by a'),
        conf=_no_nans_float_conf)
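Example #8 is Example #7 with the assert_gpu_and_cpu_are_equal_sql helper expanded by hand: generate the input once, register it as a temp view, and compare the collected results of the same query on CPU and GPU. A rough sketch of how such a wrapper can be expressed in terms of assert_gpu_and_cpu_are_equal_collect follows; this is a sketch under that assumption, not necessarily the test framework's real implementation.

def assert_gpu_and_cpu_are_equal_sql(df_fun, table_name, sql, conf={}):
    # Build the DataFrame, expose it as a temp view, and run the query;
    # assert_gpu_and_cpu_are_equal_collect then executes this function
    # on both CPU and GPU sessions and compares the collected rows.
    def do_it_all(spark):
        df_fun(spark).createOrReplaceTempView(table_name)
        return spark.sql(sql)
    assert_gpu_and_cpu_are_equal_collect(do_it_all, conf=conf)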