Example #1
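# With spark.sql.mapKeyDedupPolicy=EXCEPTION, building a map from duplicate literal keys
# should fail with "Duplicate map key" on both CPU and GPU.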
def test_map_expr_literal_keys_dupe_exception():
    data_gen = [('a', StringGen(nullable=False)), ('b', StringGen(nullable=False))]
    assert_gpu_and_cpu_error(
        lambda spark: gen_df(spark, data_gen).selectExpr(
            'map("key1", b, "key1", a) as m1').collect(),
        conf={'spark.sql.mapKeyDedupPolicy': 'EXCEPTION'},
        error_message="Duplicate map key")
Example #2
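# Casting a malformed string to a day-time interval should raise IllegalArgumentException
# on both CPU and GPU.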
def test_cast_string_to_day_time_interval_exception(invalid_string):
    dtType = DayTimeIntervalType(0, 3)
    def fun(spark):
        data=[invalid_string]
        df = spark.createDataFrame(data, StringType())
        return df.select(f.col('value').cast(dtType)).collect()
    assert_gpu_and_cpu_error(fun, {}, "java.lang.IllegalArgumentException")
Example #3
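# regexp_extract with a negative group index should be rejected on both CPU and GPU.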
def test_regexp_extract_idx_negative():
    gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}')
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, gen).selectExpr(
            'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)$", -1)').collect(),
        error_message="The specified group index cannot be less than zero",
        conf=_regexp_conf)
Example #4
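# sequence(0, a) with a stop value large enough to exceed the supported sequence length
# should fail with "Too long sequence".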
def test_sequence_too_long_sequence(stop_gen):
    assert_gpu_and_cpu_error(
        # To avoid OOM, reduce the row number to 1, it is enough to verify this case.
        lambda spark: unary_op_df(spark, stop_gen, 1).selectExpr(
            "sequence(0, a)").collect(),
        conf={},
        error_message="Too long sequence")
Example #5
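# Looking up a map key that is not present should fail under ANSI mode; the exception
# class differs on Databricks 10.4+.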
def test_simple_get_map_value_ansi_fail(data_gen):
    message = "org.apache.spark.SparkNoSuchElementException" if is_databricks104_or_later() else "java.util.NoSuchElementException"
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'a["NOT_FOUND"]').collect(),
        conf=ansi_enabled_conf,
        error_message=message)
Example #6
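# regexp_extract with group index 4 should fail because the pattern defines only three
# capture groups.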
def test_regexp_extract_idx_out_of_bounds():
    gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}')
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, gen).selectExpr(
            'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)$", 4)').collect(),
        error_message="Regex group count is 3, but the specified group index is 4",
        conf=_regexp_conf)
Example #7
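# element_at with an out-of-range positive index should raise
# ArrayIndexOutOfBoundsException when ANSI mode is enabled.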
def test_array_element_at_ansi_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).select(
            element_at(col('a'), 100)).collect(),
        conf={
            'spark.sql.ansi.enabled': True,
            'spark.sql.legacy.allowNegativeScaleOfDecimal': True
        },
        error_message='java.lang.ArrayIndexOutOfBoundsException')
Example #8
def test_map_element_at_ansi_fail(data_gen):
    message = "org.apache.spark.SparkNoSuchElementException" if (not is_before_spark_330() or is_databricks104_or_later()) else "java.util.NoSuchElementException"
    # For 3.3.0+ strictIndexOperator should not affect element_at
    test_conf=copy_and_update(ansi_enabled_conf, {'spark.sql.ansi.strictIndexOperator': 'false'})
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'element_at(a, "NOT_FOUND")').collect(),
        conf=test_conf,
        error_message=message)
Example #9
def test_dateaddinterval_ansi_exception():
    assert_gpu_and_cpu_error(
        # the interval specifies a `seconds` component: adding seconds to a date is what
        # triggers the IllegalArgumentException under ANSI mode
        lambda spark: unary_op_df(
            spark, DateGen(start=date(200, 1, 1), end=date(800, 1, 1)), seed=1)
            .selectExpr('a + (interval {} days {} seconds)'.format(1, 5)).collect(),
        conf=copy_and_update(ansi_enabled_conf, legacy_interval_enabled_conf),
        error_message="IllegalArgumentException")
Example #10
def test_map_element_at_ansi_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'element_at(a, "NOT_FOUND")').collect(),
        conf={
            'spark.sql.ansi.enabled': True,
            'spark.sql.legacy.allowNegativeScaleOfDecimal': True
        },
        error_message='java.util.NoSuchElementException')
Example #11
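# Indexing an array with an invalid index (literal or column) should fail under ANSI mode;
# Databricks 10.4+ reports SparkArrayIndexOutOfBoundsException.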
def test_array_item_ansi_fail_invalid_index(index):
    message = "SparkArrayIndexOutOfBoundsException" if is_databricks104_or_later() else "java.lang.ArrayIndexOutOfBoundsException"
    if isinstance(index, int):
        test_func = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(col('a')[index]).collect()
    else:
        test_func = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr('a[b]').collect()
    assert_gpu_and_cpu_error(
        test_func,
        conf=ansi_enabled_conf,
        error_message=message)
Example #12
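# Integral divide that overflows should raise ArithmeticException when ANSI is enabled,
# and CPU and GPU results should match otherwise.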
def test_div_overflow_exception_when_ansi(expr, ansi_enabled):
    ansi_conf = {'spark.sql.ansi.enabled': ansi_enabled}
    if ansi_enabled == 'true':
        assert_gpu_and_cpu_error(
            df_fun=lambda spark: _get_div_overflow_df(spark, expr).collect(),
            conf=ansi_conf,
            error_message='java.lang.ArithmeticException: Overflow in integral divide')
    else:
        assert_gpu_and_cpu_are_equal_collect(
            func=lambda spark: _get_div_overflow_df(spark, expr),
            conf=ansi_conf)
Example #13
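# element_at with index 0 should always fail: SQL array indices start at 1,
# regardless of the ANSI setting.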
def test_array_element_at_zero_index_fail(index, ansi_enabled):
    message = "SQL array indices start at 1"
    if isinstance(index, int):
        test_func = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(
            element_at(col('a'), index)).collect()
    else:
        test_func = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr(
            'element_at(a, b)').collect()
    assert_gpu_and_cpu_error(
        test_func,
        conf={'spark.sql.ansi.enabled':ansi_enabled},
        error_message=message)
Example #14
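# Helper shared by the div-by-zero tests: ANSI mode should raise ArithmeticException,
# while non-ANSI results should match between CPU and GPU.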
def _test_div_by_zero(ansi_mode, expr):
    ansi_conf = {'spark.sql.ansi.enabled': ansi_mode == 'ansi'}
    data_gen = lambda spark: two_col_df(
        spark, IntegerGen(), IntegerGen(min_val=0, max_val=0), length=1)
    div_by_zero_func = lambda spark: data_gen(spark).selectExpr(expr)

    if ansi_mode == 'ansi':
        assert_gpu_and_cpu_error(
            df_fun=lambda spark: div_by_zero_func(spark).collect(),
            conf=ansi_conf,
            error_message='java.lang.ArithmeticException: divide by zero')
    else:
        assert_gpu_and_cpu_are_equal_collect(div_by_zero_func, ansi_conf)
Example #15
def test_simple_get_map_value_with_strict_index(strict_index, data_gen):
    message = "org.apache.spark.SparkNoSuchElementException"
    test_conf=copy_and_update(ansi_enabled_conf, {'spark.sql.ansi.strictIndexOperator': strict_index})
    if strict_index == 'true':
        assert_gpu_and_cpu_error(
                lambda spark: unary_op_df(spark, data_gen).selectExpr(
                        'a["NOT_FOUND"]').collect(),
                conf=test_conf,
                error_message=message)
    else:
        assert_gpu_and_cpu_are_equal_collect(
                lambda spark: unary_op_df(spark, data_gen).selectExpr(
                        'a["NOT_FOUND"]'),
                conf=test_conf)
Example #16
def test_array_element_at_ansi_fail_invalid_index(index):
    message = "ArrayIndexOutOfBoundsException" if is_before_spark_330() else "SparkArrayIndexOutOfBoundsException"
    if isinstance(index, int):
        test_func = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(
            element_at(col('a'), index)).collect()
    else:
        test_func = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr(
            'element_at(a, b)').collect()
    # For 3.3.0+ strictIndexOperator should not affect element_at
    test_conf=copy_and_update(ansi_enabled_conf, {'spark.sql.ansi.strictIndexOperator': 'false'})
    assert_gpu_and_cpu_error(
        test_func,
        conf=test_conf,
        error_message=message)
Example #17
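# Invalid dates in JSON input: EXCEPTION policy should raise DateTimeException,
# LEGACY with ANSI enabled falls back to the CPU FileSourceScanExec, and all other
# combinations should produce matching CPU and GPU results.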
def test_json_read_invalid_dates(std_input_path, filename, schema, read_func,
                                 ansi_enabled, time_parser_policy,
                                 spark_tmp_table_factory):
    updated_conf = copy_and_update(
        _enable_all_types_conf, {
            'spark.sql.ansi.enabled': ansi_enabled,
            'spark.sql.legacy.timeParserPolicy': time_parser_policy
        })
    f = read_func(std_input_path + '/' + filename, schema,
                  spark_tmp_table_factory, {})
    if time_parser_policy == 'EXCEPTION':
        assert_gpu_and_cpu_error(df_fun=lambda spark: f(spark).collect(),
                                 conf=updated_conf,
                                 error_message='DateTimeException')
    elif time_parser_policy == 'LEGACY' and ansi_enabled == 'true':
        assert_gpu_fallback_collect(f, 'FileSourceScanExec', conf=updated_conf)
    else:
        assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf)
Example #18
def test_array_item_with_strict_index(strict_index_enabled, index):
    message = "SparkArrayIndexOutOfBoundsException"
    if isinstance(index, int):
        test_df = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(col('a')[index])
    else:
        test_df = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr('a[b]')

    test_conf=copy_and_update(
        ansi_enabled_conf, {'spark.sql.ansi.strictIndexOperator': strict_index_enabled})

    if strict_index_enabled:
        assert_gpu_and_cpu_error(
            lambda spark: test_df(spark).collect(),
            conf=test_conf,
            error_message=message)
    else:
        assert_gpu_and_cpu_are_equal_collect(
            test_df,
            conf=test_conf)
Example #19
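# (c + 2) overflows when int_arg is INT_MAX; with ANSI enabled an error is expected only
# when the left-hand side does not short-circuit the operator (False for AND, True for OR).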
def test_logical_with_side_effect(ansi_enabled, lhs_arg, int_arg, logic_op):
    def do_it(spark, lhs_bool_arg, arith_arg, op):
        schema = StructType([
            StructField("a", BooleanType()),
            StructField("b", BooleanType()),
            StructField("c", IntegerType())])
        return spark.createDataFrame(
            [(False, True, arith_arg), (False, True, 1), (False, True, -5)],
            schema=schema
        ).selectExpr('{} {} (c + 2) > 0'.format(lhs_bool_arg, op))
    ansi_conf = {'spark.sql.ansi.enabled': ansi_enabled}
    bypass_map = {'AND': 'a', 'OR': 'b'}
    expect_error = int_arg == INT_MAX and (lhs_arg == 'NULL' or bypass_map[logic_op] != lhs_arg)
    
    if ansi_enabled == 'true' and expect_error:
        assert_gpu_and_cpu_error(
            df_fun=lambda spark: do_it(spark, lhs_arg, int_arg, logic_op).collect(),
            conf=ansi_conf,
            error_message="ArithmeticException")
    else:
        assert_gpu_and_cpu_are_equal_collect(
            func=lambda spark: do_it(spark, lhs_arg, int_arg, logic_op),
            conf=ansi_conf)
Example #20
def test_read_valid_and_invalid_dates(std_input_path, filename,
                                      v1_enabled_list, ansi_enabled,
                                      time_parser_policy):
    data_path = std_input_path + '/' + filename
    updated_conf = copy_and_update(
        _enable_all_types_conf, {
            'spark.sql.sources.useV1SourceList': v1_enabled_list,
            'spark.sql.ansi.enabled': ansi_enabled,
            'spark.sql.legacy.timeParserPolicy': time_parser_policy
        })
    if time_parser_policy == 'EXCEPTION':
        assert_gpu_and_cpu_error(
            lambda spark: spark.read
                .schema(_date_schema)
                .csv(data_path)
                .collect(),
            conf=updated_conf,
            error_message='DateTimeException')
    else:
        assert_gpu_and_cpu_are_equal_collect(
            lambda spark: spark.read
                .schema(_date_schema)
                .csv(data_path),
            conf=updated_conf)
Example #21
def test_gettimestamp_ansi_exception():
    assert_gpu_and_cpu_error(
        lambda spark: invalid_date_string_df(spark).select(
            f.to_date(f.col("a"), "yyyy-MM-dd")).collect(),
        error_message="Exception",
        conf=ansi_enabled_conf)
Example #22
def test_string_unix_timestamp_ansi_exception():
    assert_gpu_and_cpu_error(
        lambda spark: invalid_date_string_df(spark).select(
            f.unix_timestamp(f.col('a'), 'yyyy/MM/dd')).collect(),
        error_message="Exception",
        conf=ansi_enabled_conf)
Example #23
def test_string_to_unix_timestamp_ansi_exception():
    assert_gpu_and_cpu_error(
        lambda spark: invalid_date_string_df(spark).selectExpr(
            "to_unix_timestamp(a, '{}')".format('yyyy/MM/dd')).collect(),
        error_message="Exception",
        conf=ansi_enabled_conf)
Example #24
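# transform_keys producing a null key should fail with "Cannot use null as map key".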
def test_transform_keys_null_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'transform_keys(a, (key, value) -> CAST(null as INT))').collect(),
        conf={},
        error_message='Cannot use null as map key')
Example #25
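# transform_keys mapping every key to the same constant should fail with "Duplicate map key".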
def test_transform_keys_duplicate_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'transform_keys(a, (key, value) -> 1)').collect(),
        conf={},
        error_message='Duplicate map key')
Example #26
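# Start/stop/step combinations that cannot form a valid sequence should fail with
# "Illegal sequence boundaries".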
def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen):
    assert_gpu_and_cpu_error(
        lambda spark: three_col_df(spark, start_gen, stop_gen, step_gen
                                   ).selectExpr("sequence(a, b, c)").collect(),
        conf={},
        error_message="Illegal sequence boundaries")
Example #27
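# Casting Inf, -Inf, and NaN doubles to timestamp should raise DateTimeException
# when ANSI mode is enabled.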
def test_cast_double_to_timestamp(type):
    def fun(spark):
        data=[float("inf"),float("-inf"),float("nan")]
        df = spark.createDataFrame(data, DoubleType())
        return df.select(f.col('value').cast(TimestampType())).collect()
    assert_gpu_and_cpu_error(fun, {"spark.sql.ansi.enabled": True}, "java.time.DateTimeException")
Example #28
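# The replacement references group $5 while the pattern has only four capture groups,
# so REGEXP_REPLACE should fail on both CPU and GPU.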
def test_re_replace_backrefs_idx_out_of_bounds():
    gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}')
    assert_gpu_and_cpu_error(lambda spark: unary_op_df(spark, gen).selectExpr(
        'REGEXP_REPLACE(a, "(T)(E)(S)(T)", "[$5]")').collect(),
        conf=_regexp_conf,
        error_message='')