Example #1
def test_round_trip_for_interval(spark_tmp_path, v1_enabled_list):
    csv_interval_gens = [
        DayTimeIntervalGen(start_field="day", end_field="day"),
        DayTimeIntervalGen(start_field="day", end_field="hour"),
        DayTimeIntervalGen(start_field="day", end_field="minute"),
        DayTimeIntervalGen(start_field="day", end_field="second"),
        DayTimeIntervalGen(start_field="hour", end_field="hour"),
        DayTimeIntervalGen(start_field="hour", end_field="minute"),
        DayTimeIntervalGen(start_field="hour", end_field="second"),
        DayTimeIntervalGen(start_field="minute", end_field="minute"),
        DayTimeIntervalGen(start_field="minute", end_field="second"),
        DayTimeIntervalGen(start_field="second", end_field="second"),
    ]

    gen = StructGen([('_c' + str(i), csv_interval_gens[i])
                     for i in range(0, len(csv_interval_gens))],
                    nullable=False)
    data_path = spark_tmp_path + '/CSV_DATA'
    schema = gen.data_type
    updated_conf = copy_and_update(
        _enable_all_types_conf,
        {'spark.sql.sources.useV1SourceList': v1_enabled_list})
    with_cpu_session(lambda spark: gen_df(spark, gen).write.csv(data_path))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.schema(schema).csv(data_path),
        conf=updated_conf)
Example #2
def test_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_list,
                                 ansi_enabled, time_parser_policy):
    gen = StructGen([('a', DateGen())], nullable=False)
    data_path = spark_tmp_path + '/CSV_DATA'
    schema = gen.data_type
    updated_conf = copy_and_update(
        _enable_all_types_conf, {
            'spark.sql.sources.useV1SourceList': v1_enabled_list,
            'spark.sql.ansi.enabled': ansi_enabled,
            'spark.sql.legacy.timeParserPolicy': time_parser_policy
        })
    with_cpu_session(
            lambda spark : gen_df(spark, gen).write\
                    .option('dateFormat', date_format)\
                    .csv(data_path))
    if time_parser_policy == 'LEGACY':
        expected_class = 'FileSourceScanExec'
        if v1_enabled_list == '':
            expected_class = 'BatchScanExec'
        assert_gpu_fallback_collect(
            lambda spark : spark.read \
                .schema(schema) \
                .option('dateFormat', date_format) \
                .csv(data_path),
            expected_class,
            conf=updated_conf)
    else:
        assert_gpu_and_cpu_are_equal_collect(
            lambda spark : spark.read\
                    .schema(schema)\
                    .option('dateFormat', date_format)\
                    .csv(data_path),
            conf=updated_conf)
Example #3
def test_compress_read_round_trip(spark_tmp_path, compress):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: binary_op_df(spark, long_gen).write.parquet(data_path),
        conf={'spark.sql.parquet.compression.codec': compress})
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path))
Example #4
def _assert_gpu_and_cpu_writes_are_equal(write_func,
                                         read_func,
                                         base_path,
                                         mode,
                                         conf={}):
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    cpu_path = base_path + '/CPU'
    with_cpu_session(lambda spark: write_func(spark, cpu_path), conf=conf)
    cpu_end = time.time()
    print('### GPU RUN ###')
    gpu_start = time.time()
    gpu_path = base_path + '/GPU'
    with_gpu_session(lambda spark: write_func(spark, gpu_path), conf=conf)
    gpu_end = time.time()
    print('### WRITE: GPU TOOK {} CPU TOOK {} ###'.format(
        gpu_end - gpu_start, cpu_end - cpu_start))

    (cpu_bring_back, cpu_collect_type) = _prep_func_for_compare(
        lambda spark: read_func(spark, cpu_path), mode)
    (gpu_bring_back, gpu_collect_type) = _prep_func_for_compare(
        lambda spark: read_func(spark, gpu_path), mode)

    from_cpu = with_cpu_session(cpu_bring_back, conf=conf)
    from_gpu = with_cpu_session(gpu_bring_back, conf=conf)
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
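For context, a minimal sketch of how this helper might be invoked, following the Parquet round-trip pattern used elsewhere in this listing. The test name and the write/read lambdas below are illustrative assumptions, not part of the original suite:

def test_parquet_write_round_trip_sketch(spark_tmp_path):
    # Hypothetical example: write with both CPU and GPU, read both outputs back
    # on the CPU, and compare the collected rows.
    base_path = spark_tmp_path + '/PARQUET_WRITE'
    _assert_gpu_and_cpu_writes_are_equal(
        lambda spark, path: unary_op_df(spark, long_gen).write.parquet(path),  # write_func
        lambda spark, path: spark.read.parquet(path),                          # read_func
        base_path,
        'COLLECT')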
Example #5
def test_orc_scan_with_aggregate_pushdown_on_col_partition(spark_tmp_path, aggregate):
    """
    Spark 3.3.0+ allows aggregate pushdown on ORC when spark.sql.orc.aggregatePushdown is enabled.
    Note that Min/Max are not pushed down for partition columns; only Count is.
    This test checks that the GPU falls back to the CPU when aggregates are pushed down on ORC.
    When the Spark configuration is enabled, we check the following:
    +-----------+------------------+--------------+
    | Aggregate | Partition Column | FallBack CPU |
    +-----------+------------------+--------------+
    |   COUNT   |        Y         |      Y       |
    +-----------+------------------+--------------+
    """
    data_path = spark_tmp_path + '/ORC_DATA/pushdown_01.orc'
    # GPU ORC write with statistics is not working correctly,
    # so create the ORC file in a CPU session as a workaround.
    # Partition column: p
    with_cpu_session(lambda spark: spark.range(10).selectExpr("id", "id % 3 as p").write
                                .partitionBy("p")
                                .orc(data_path))
    
    # fallback to CPU only if aggregate is COUNT
    assert_cpu_and_gpu_are_equal_collect_with_capture(
            lambda spark: _do_orc_scan_with_agg(spark, data_path, aggregate),
            exist_classes="BatchScanExec",
            non_exist_classes="GpuBatchScanExec",
            conf=_orc_aggregate_pushdown_enabled_conf)
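The helper _do_orc_scan_with_agg is defined elsewhere in the test module and is not shown in this listing. A minimal sketch of what such a helper could look like, assuming it simply runs the requested aggregate over the partition column p of the ORC scan:

def _do_orc_scan_with_agg(spark, path, agg):
    # Hypothetical sketch: build e.g. "count(p)", "min(p)" or "max(p)" over the partition column.
    return spark.read.orc(path).selectExpr('{}(p) as agg_col'.format(agg))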
Example #6
def test_missing_column_names_filter():
    if is_spark_300():
        pytest.skip("Apache Spark 3.0.0 does not handle ORC files without column names")

    with_cpu_session(setup_orc_file_no_column_names)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT _col3,_col2 FROM test_orc_data WHERE _col2 = '155'"))
Example #7
def test_missing_column_names(spark_tmp_table_factory, reader_confs):
    table_name = spark_tmp_table_factory.get()
    with_cpu_session(
        lambda spark: setup_orc_file_no_column_names(spark, table_name))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.sql("SELECT _col3,_col2 FROM {}".format(table_name)
                                ), reader_confs)
Example #8
def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list,
                                 reader_confs):
    # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed
    # we should go with a more standard set of generators
    parquet_gens = [
        byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
        string_gen, boolean_gen,
        DateGen(start=date(1590, 1, 1)),
        TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc))
    ]
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    first_data_path = spark_tmp_path + '/PARQUET_DATA/key=0'
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list).write.parquet(first_data_path),
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'LEGACY'})
    second_data_path = spark_tmp_path + '/PARQUET_DATA/key=1'
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list).write.parquet(second_data_path),
        conf={
            'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED'
        })
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list})
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path), conf=all_confs)
Example #9
def test_read_struct_without_stream(spark_tmp_path):
    data_gen = StructGen([['c_byte', ByteGen(nullable=False)]], nullable=False)
    data_path = spark_tmp_path + '/ORC_DATA'
    with_cpu_session(
            lambda spark : unary_op_df(spark, data_gen, 10).write.orc(data_path))
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : spark.read.orc(data_path))
Example #10
def test_parquet_reading_from_unaligned_pages_all_types_dict_optimized(
        spark_tmp_path, reader_confs, enable_dictionary, v1_enabled_list):
    all_confs = copy_and_update(
        reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list})
    data_path = spark_tmp_path + '/PARQUET_UNALIGNED_DATA'
    with_cpu_session(lambda spark : spark.range(0, 2000)\
            .selectExpr("id as _1",
                "cast(id % 10 as byte) as _2",
                "cast(id % 10 as short) as _3",
                "cast(id % 10 as int) as _4",
                "cast(id % 10 as float) as _5",
                "cast(id % 10 as double) as _6",
                # DECIMAL128 IS NOT SUPPORTED YET "cast(id % 10 as decimal(20,0)) as _7",
                "cast(id % 10 as decimal(10,0)) as _7",
                "cast(id % 10 as decimal(20,0)) as _8",
                "cast(id % 2 as boolean) as _9",
                "cast(cast(1618161925 + ((id % 10) * 60 * 60 * 24) as timestamp) as date) as _10",
                "cast(1618161925 + (id % 10) as timestamp) as _11")\
            .coalesce(1)\
            .write\
            .option("parquet.page.size", "4096")
            .option("parquet.enable.dictionary", enable_dictionary)
            .parquet(data_path))
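    # 'filters' is assumed to be a list of SQL filter strings defined elsewhere in the test module.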
    for filter_str in filters:
        assert_gpu_and_cpu_are_equal_collect(
            lambda spark: spark.read.parquet(data_path).filter(filter_str),
            all_confs)
Example #11
def test_parquet_read_merge_schema_from_conf(spark_tmp_path, v1_enabled_list,
                                             reader_confs):
    # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed
    # we should go with a more standard set of generators
    parquet_gens = [
        byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
        string_gen, boolean_gen,
        DateGen(start=date(1590, 1, 1)),
        TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc))
    ] + decimal_gens
    first_gen_list = [('_c' + str(i), gen)
                      for i, gen in enumerate(parquet_gens)]
    first_data_path = spark_tmp_path + '/PARQUET_DATA/key=0'
    with_cpu_session(
        lambda spark: gen_df(spark, first_gen_list).write.parquet(first_data_path),
        conf=rebase_write_legacy_conf)
    second_gen_list = [(('_c' if i % 2 == 0 else '_b') + str(i), gen)
                       for i, gen in enumerate(parquet_gens)]
    second_data_path = spark_tmp_path + '/PARQUET_DATA/key=1'
    with_cpu_session(
        lambda spark: gen_df(spark, second_gen_list).write.parquet(second_data_path),
        conf=rebase_write_corrected_conf)
    all_confs = copy_and_update(
        reader_confs, {
            'spark.sql.sources.useV1SourceList': v1_enabled_list,
            'spark.sql.parquet.mergeSchema': 'true'
        })
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path), conf=all_confs)
Example #12
def test_host_columnar_transition(spark_tmp_path, data_gen):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: unary_op_df(spark, data_gen).write.parquet(data_path))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path).filter("a IS NOT NULL"),
        conf={'spark.rapids.sql.exec.FileSourceScanExec': 'false'})
Example #13
def test_explain_udf():
    slen = udf(lambda s: len(s), IntegerType())

    @udf
    def to_upper(s):
        if s is not None:
            return s.upper()

    @udf(returnType=IntegerType())
    def add_one(x):
        if x is not None:
            return x + 1

    def do_explain(spark):
        df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
        df2 = df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age"))
        explain_str = spark.sparkContext._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df2._jdf, "ALL")
        # udf shouldn't be on GPU
        udf_str_not = 'cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.python.BatchEvalPythonExec'
        assert udf_str_not in explain_str
        not_on_gpu_str = spark.sparkContext._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df2._jdf, "NOT")
        assert udf_str_not in not_on_gpu_str
        assert "will run on GPU" not in not_on_gpu_str

    with_cpu_session(do_explain)
Example #14
def test_read_schema_missing_cols(spark_tmp_path, v1_enabled_list, mt_opt):
    # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed
    # we should go with a more standard set of generators
    parquet_gens = [byte_gen, short_gen, int_gen, long_gen]
    first_gen_list = [('_c' + str(i), gen)
                      for i, gen in enumerate(parquet_gens)]
    first_data_path = spark_tmp_path + '/PARQUET_DATA/key=0'
    with_cpu_session(
        lambda spark: gen_df(spark, first_gen_list, 1).write.parquet(first_data_path))
    # generate with 1 column less
    second_parquet_gens = [byte_gen, short_gen, int_gen]
    second_gen_list = [('_c' + str(i), gen)
                       for i, gen in enumerate(second_parquet_gens)]
    second_data_path = spark_tmp_path + '/PARQUET_DATA/key=1'
    with_cpu_session(
        lambda spark: gen_df(spark, second_gen_list, 1).write.parquet(second_data_path))
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path),
        conf={
            'spark.rapids.sql.format.parquet.multiThreadedRead.enabled':
            mt_opt,
            'spark.sql.sources.useV1SourceList': v1_enabled_list,
            'spark.sql.files.maxPartitionBytes': "1g",
            'spark.sql.files.minPartitionNum': '1'
        })
Example #15
def test_disorder_read_schema(spark_tmp_table_factory, reader_confs):
    table_name = spark_tmp_table_factory.get()
    with_cpu_session(lambda spark : setup_orc_file_with_column_names(spark, table_name))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT c_2,c_1 FROM {}".format(table_name)),
        reader_confs)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT c_3,c_1 FROM {}".format(table_name)),
        reader_confs)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT c_3,c_2 FROM {}".format(table_name)),
        reader_confs)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT c_1,c_3,c_2 FROM {}".format(table_name)),
        reader_confs)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT c_1,c_2,c_3 FROM {}".format(table_name)),
        reader_confs)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT c_2,c_1,c_3 FROM {}".format(table_name)),
        reader_confs)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT c_2,c_3,c_1 FROM {}".format(table_name)),
        reader_confs)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT c_3,c_1,c_2 FROM {}".format(table_name)),
        reader_confs)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT c_3,c_2,c_1 FROM {}".format(table_name)),
        reader_confs)
Example #16
def test_select_float_order_local(sql_query_line, pytestconfig):
    sql_query = sql_query_line[0]
    if sql_query:
        print(sql_query)
        with_cpu_session(num_stringDf)
        assert_gpu_and_cpu_are_equal_collect(
            lambda spark: spark.sql(sql_query), conf=_qa_conf)
Example #17
def assert_gpu_fallback_write(write_func,
        read_func,
        base_path,
        cpu_fallback_class_name,
        conf={}):
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    cpu_path = base_path + '/CPU'
    with_cpu_session(lambda spark : write_func(spark, cpu_path), conf=conf)
    cpu_end = time.time()
    print('### GPU RUN ###')
    jvm = spark_jvm()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.startCapture()
    gpu_start = time.time()
    gpu_path = base_path + '/GPU'
    with_gpu_session(lambda spark : write_func(spark, gpu_path), conf=conf)
    gpu_end = time.time()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.assertCapturedAndGpuFellBack(cpu_fallback_class_name, 2000)
    print('### WRITE: GPU TOOK {} CPU TOOK {} ###'.format(
        gpu_end - gpu_start, cpu_end - cpu_start))

    (cpu_bring_back, cpu_collect_type) = _prep_func_for_compare(
            lambda spark: read_func(spark, cpu_path), 'COLLECT')
    (gpu_bring_back, gpu_collect_type) = _prep_func_for_compare(
            lambda spark: read_func(spark, gpu_path), 'COLLECT')

    from_cpu = with_cpu_session(cpu_bring_back, conf=conf)
    from_gpu = with_cpu_session(gpu_bring_back, conf=conf)
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
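A hedged usage sketch of this fallback helper, following the write-test pattern above. The test name, the CSV write, and the DataWritingCommandExec fallback class below are illustrative assumptions:

def test_write_fallback_sketch(spark_tmp_path):
    # Hypothetical example: expect the GPU plan to fall back to the named CPU write exec,
    # then verify that the CPU-written and GPU-written outputs still match.
    base_path = spark_tmp_path + '/CSV_FALLBACK'
    assert_gpu_fallback_write(
        lambda spark, path: unary_op_df(spark, long_gen).write.csv(path),  # write_func
        lambda spark, path: spark.read.csv(path),                          # read_func
        base_path,
        'DataWritingCommandExec')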
Example #18
def test_read_merge_schema_from_conf(spark_tmp_path, v1_enabled_list, mt_opt):
    # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed
    # we should go with a more standard set of generators
    parquet_gens = [
        byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
        string_gen, boolean_gen,
        DateGen(start=date(1590, 1, 1)),
        TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc))
    ]
    first_gen_list = [('_c' + str(i), gen)
                      for i, gen in enumerate(parquet_gens)]
    first_data_path = spark_tmp_path + '/PARQUET_DATA/key=0'
    with_cpu_session(
        lambda spark: gen_df(spark, first_gen_list).write.parquet(
            first_data_path),
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'LEGACY'})
    second_gen_list = [(('_c' if i % 2 == 0 else '_b') + str(i), gen)
                       for i, gen in enumerate(parquet_gens)]
    second_data_path = spark_tmp_path + '/PARQUET_DATA/key=1'
    with_cpu_session(
        lambda spark: gen_df(spark, second_gen_list).write.parquet(second_data_path),
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED'})
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path),
        conf={
            'spark.rapids.sql.format.parquet.multiThreadedRead.enabled':
            mt_opt,
            'spark.sql.parquet.mergeSchema': "true",
            'spark.sql.sources.useV1SourceList': v1_enabled_list
        })
Example #19
def test_iceberg_fallback_not_unsafe_row(spark_tmp_table_factory):
    table = spark_tmp_table_factory.get()
    def setup_iceberg_table(spark):
        spark.sql("CREATE TABLE {} (id BIGINT, data STRING) USING ICEBERG".format(table))
        spark.sql("INSERT INTO {} VALUES (1, 'a'), (2, 'b'), (3, 'c')".format(table))
    with_cpu_session(setup_iceberg_table)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT COUNT(DISTINCT id) from {}".format(table)))
Example #20
def test_select_first_last(sql_query_line, pytestconfig):
    sql_query = sql_query_line[0]
    if sql_query:
        print(sql_query)
        with_cpu_session(
            lambda spark: num_stringDf_first_last(spark, sql_query_line[2]))
        assert_gpu_and_cpu_are_equal_collect(
            lambda spark: spark.sql(sql_query), conf=_first_last_qa_conf)
Example #21
def test_read_round_trip(spark_tmp_path, orc_gens, read_func, v1_enabled_list):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list).write.orc(data_path))
    assert_gpu_and_cpu_are_equal_collect(
        read_func(data_path),
        conf={'spark.sql.sources.useV1SourceList': v1_enabled_list})
Example #22
def test_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list):
    data_path = spark_tmp_path + '/ORC_DATA'
    with_cpu_session(
        lambda spark: binary_op_df(spark, long_gen).write.orc(data_path),
        conf={'spark.sql.orc.compression.codec': compress})
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.orc(data_path),
        conf={'spark.sql.sources.useV1SourceList': v1_enabled_list})
Example #23
def test_parquet_push_down_on_interval_type(spark_tmp_path):
    gen_list = [('_c1', DayTimeIntervalGen())]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list).coalesce(1).write.parquet(data_path))
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: spark.read.parquet(data_path), "testData",
        "select * from testData where _c1 > interval '10 0:0:0' day to second")
Example #24
def test_read_round_trip_legacy(spark_tmp_path, parquet_gens):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list).write.parquet(data_path),
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'LEGACY'})
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path))
Example #25
def test_round_trip(spark_tmp_path, data_gen):
    gen = StructGen([('a', data_gen)], nullable=False)
    data_path = spark_tmp_path + '/CSV_DATA'
    schema = gen.data_type
    with_cpu_session(lambda spark: gen_df(spark, gen).write.csv(data_path))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.schema(schema).csv(data_path),
        conf=_enable_ts_conf)
Example #26
def test_parquet_read_daytime_interval_cpu_file(spark_tmp_path):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    gen_list = [('_c1', DayTimeIntervalGen())]
    # write DayTimeInterval with CPU
    with_cpu_session(lambda spark: gen_df(spark, gen_list).coalesce(1)
                     .write.mode("overwrite").parquet(data_path))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path))
Example #27
def test_pred_push_round_trip(spark_tmp_path, orc_gen, read_func):
    data_path = spark_tmp_path + '/ORC_DATA'
    gen_list = [('a', RepeatSeqGen(orc_gen, 100)), ('b', orc_gen)]
    s0 = gen_scalar(orc_gen, force_no_nulls=True)
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list).orderBy('a').write.orc(data_path))
    rf = read_func(data_path)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: rf(spark).select(f.col('a') >= s0))
Example #28
def test_null_literal(spark_tmp_path, data_gen):
    # Write data to Parquet so Spark generates a plan using just the count of the data.
    data_path = spark_tmp_path + '/AST_TEST_DATA'
    with_cpu_session(lambda spark: gen_df(spark, [("a", IntegerGen())]).write.
                     parquet(data_path))
    data_type = data_gen.data_type
    assert_gpu_ast(is_supported=True,
                   func=lambda spark: spark.read.parquet(data_path).select(
                       f.lit(None).cast(data_type)))
Example #29
def test_literal(spark_tmp_path, data_gen):
    # Write data to Parquet so Spark generates a plan using just the count of the data.
    data_path = spark_tmp_path + '/AST_TEST_DATA'
    with_cpu_session(lambda spark: gen_df(spark, [("a", IntegerGen())]).write.
                     parquet(data_path))
    scalar = gen_scalar(data_gen, force_no_nulls=True)
    assert_gpu_ast(
        is_supported=True,
        func=lambda spark: spark.read.parquet(data_path).select(scalar))
Example #30
def test_decimal_read_legacy(spark_tmp_path, parquet_gens, read_func, reader_confs, v1_enabled_list):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
            lambda spark : gen_df(spark, gen_list).write.parquet(data_path),
            conf={'spark.sql.parquet.writeLegacyFormat': 'true'})
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list})
    assert_gpu_and_cpu_are_equal_collect(read_func(data_path), conf=all_confs)