Example #1
def test_part_write_round_trip(spark_tmp_path, orc_gen):
    gen_list = [('a', RepeatSeqGen(orc_gen, 10)), ('b', orc_gen)]
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').orc(path),
        lambda spark, path: spark.read.orc(path), data_path)
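These examples are shown without their pytest.mark.parametrize decorators, which is where arguments such as orc_gen arrive from (spark_tmp_path and spark_tmp_table_factory are fixtures). A minimal sketch of how Example #1 would typically be parametrized; the list name, its generators, and the idfn id helper are illustrative assumptions, not taken from the source:

import pytest

# Hypothetical sweep of input generators (assumption; the real suite
# defines its own generator lists).
orc_part_write_gens = [byte_gen, short_gen, int_gen, long_gen, string_gen]

@pytest.mark.parametrize('orc_gen', orc_part_write_gens, ids=idfn)
def test_part_write_round_trip(spark_tmp_path, orc_gen):
    ...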
Example #2
def test_compress_write_round_trip(spark_tmp_path, compress):
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path : binary_op_df(spark, long_gen).coalesce(1).write.orc(path),
            lambda spark, path : spark.read.orc(path),
            data_path,
            conf={'spark.sql.orc.compression.codec': compress})
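Here compress is expected to sweep the codec names Spark accepts for spark.sql.orc.compression.codec. A hedged sketch of the parametrization; the exact codec list in the real suite may differ:

@pytest.mark.parametrize('compress', ['none', 'uncompressed', 'snappy', 'zlib'])
def test_compress_write_round_trip(spark_tmp_path, compress):
    ...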
Example #3
def test_roundtrip_with_rebase_values(spark_tmp_path, ts_write_data_gen,
                                      date_time_rebase_read,
                                      date_time_rebase_write,
                                      int96_rebase_read, int96_rebase_write):
    ts_write, gen = ts_write_data_gen
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = {
        'spark.sql.parquet.outputTimestampType': ts_write,
        'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': date_time_rebase_write,
        'spark.sql.legacy.parquet.int96RebaseModeInWrite': int96_rebase_write,
        'spark.sql.legacy.parquet.datetimeRebaseModeInRead': date_time_rebase_read,
        'spark.sql.legacy.parquet.int96RebaseModeInRead': int96_rebase_read
    }

    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: unary_op_df(spark, gen).coalesce(1).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=all_confs)
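Example #3 exercises Spark's legacy-calendar rebase handling for Parquet: each of the four spark.sql.legacy.parquet.*RebaseModeIn{Write,Read} configs accepts 'LEGACY', 'CORRECTED', or 'EXCEPTION', and the test varies the write-side and read-side modes independently. A sketch of the implied parametrization (the mode list and the ts_write_data_gen wiring are assumptions):

rebase_modes = ['CORRECTED', 'LEGACY', 'EXCEPTION']

@pytest.mark.parametrize('date_time_rebase_write', rebase_modes)
@pytest.mark.parametrize('date_time_rebase_read', rebase_modes)
@pytest.mark.parametrize('int96_rebase_write', rebase_modes)
@pytest.mark.parametrize('int96_rebase_read', rebase_modes)
def test_roundtrip_with_rebase_values(spark_tmp_path, ts_write_data_gen,
                                      date_time_rebase_read, date_time_rebase_write,
                                      int96_rebase_read, int96_rebase_write):
    # ts_write_data_gen would be parametrized separately (elided here)
    ...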
Example #4
def test_write_round_trip(spark_tmp_path, orc_gens):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.orc(path),
            lambda spark, path: spark.read.orc(path),
            data_path)
Example #5
def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark_tmp_table_factory):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: write_orc_sql_from(spark, gen_df(spark, gen_list).coalesce(1), path, spark_tmp_table_factory.get()),
            lambda spark, path: spark.read.orc(path),
            data_path,
            conf={'spark.sql.orc.impl': orc_impl})
Example #6
def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.orc(path),
            lambda spark, path: spark.read.orc(path),
            data_path,
            conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
Example #7
def test_compress_write_round_trip(spark_tmp_path, compress, mt_opt, v1_enabled_list):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path : binary_op_df(spark, long_gen).coalesce(1).write.parquet(path),
            lambda spark, path : spark.read.parquet(path),
            data_path,
            conf={'spark.sql.parquet.compression.codec': compress,
                'spark.rapids.sql.format.parquet.multiThreadedRead.enabled': mt_opt,
                'spark.sql.sources.useV1SourceList': v1_enabled_list})
Example #8
def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase):
    gen = TimestampGen()
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: unary_op_df(spark, gen).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
              'spark.sql.parquet.outputTimestampType': ts_type})
Example #9
def test_compress_write_round_trip(spark_tmp_path, compress):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = {'spark.sql.parquet.compression.codec': compress}
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: binary_op_df(spark, long_gen).coalesce(1).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=all_confs)
Example #10
def test_write_round_trip(spark_tmp_path, parquet_gens):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED',
                'spark.sql.parquet.outputTimestampType': 'TIMESTAMP_MICROS'})
Example #11
def test_part_write_round_trip(spark_tmp_path, parquet_gen):
    gen_list = [('a', RepeatSeqGen(parquet_gen, 10)), ('b', parquet_gen)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=writer_confs)
Example #12
def test_write_round_trip(spark_tmp_path, parquet_gens):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=writer_confs)
Example #13
def test_write_daytime_interval(spark_tmp_path):
    gen_list = [('_c1', DayTimeIntervalGen())]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=writer_confs)
Example #14
def test_compress_write_round_trip(spark_tmp_path, compress, v1_enabled_list, reader_confs):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list,
            'spark.sql.parquet.compression.codec': compress})
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path : binary_op_df(spark, long_gen).coalesce(1).write.parquet(path),
            lambda spark, path : spark.read.parquet(path),
            data_path,
            conf=all_confs)
Example #15
def test_part_write_round_trip(spark_tmp_path, parquet_gen):
    gen_list = [('a', RepeatSeqGen(parquet_gen, 10)),
            ('b', parquet_gen)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED',
                'spark.sql.parquet.outputTimestampType': 'TIMESTAMP_MICROS'})
Example #16
def test_write_sql_save_table(spark_tmp_path, parquet_gens, ts_type, spark_tmp_table_factory):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = {'spark.sql.parquet.outputTimestampType': ts_type}
    all_confs.update(writer_confs)
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: write_parquet_sql_from(spark, gen_df(spark, gen_list).coalesce(1), path, spark_tmp_table_factory.get()),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)
Example #17
def test_write_save_table(spark_tmp_path, orc_gens, orc_impl, spark_tmp_table_factory):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    all_confs = {'spark.sql.sources.useV1SourceList': 'orc',
                 'spark.sql.orc.impl': orc_impl}
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.format("orc").mode('overwrite').option("path", path).saveAsTable(spark_tmp_table_factory.get()),
            lambda spark, path: spark.read.orc(path),
            data_path,
            conf=all_confs)
Example #18
def test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = copy_and_update(
        writer_confs, {'spark.sql.parquet.outputTimestampType': ts_type})
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=all_confs)
Example #19
def test_write_round_trip(spark_tmp_path, parquet_gens, mt_opt, v1_enabled_list, ts_type):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED',
                'spark.sql.parquet.outputTimestampType': ts_type,
                'spark.rapids.sql.format.parquet.multiThreadedRead.enabled': mt_opt,
                'spark.sql.sources.useV1SourceList': v1_enabled_list})
Example #20
def test_write_empty_orc_round_trip(spark_tmp_path, orc_gens):
    def create_empty_df(spark, path):
        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
        return gen_df(spark, gen_list, length=0).write.orc(path)

    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        create_empty_df,
        lambda spark, path: spark.read.orc(path),
        data_path,
        conf={'spark.rapids.sql.format.orc.write.enabled': True})
Example #21
def test_write_empty_parquet_round_trip(spark_tmp_path, parquet_gens):
    def create_empty_df(spark, path):
        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
        return gen_df(spark, gen_list, length=0).write.parquet(path)

    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        create_empty_df,
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=writer_confs)
Example #22
def test_write_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.format("parquet").mode('overwrite').option("path", path).saveAsTable(spark_tmp_table_factory.get()),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=writer_confs)
Example #23
def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase):
    # we are limiting TimestampGen to avoid overflowing the INT96 value
    # see https://github.com/rapidsai/cudf/issues/8070
    gen = TimestampGen(start=datetime(1677, 9, 22, tzinfo=timezone.utc), end=datetime(2262, 4, 11, tzinfo=timezone.utc))
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: unary_op_df(spark, gen).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
              'spark.sql.legacy.parquet.int96RebaseModeInWrite': ts_rebase,
              'spark.sql.parquet.outputTimestampType': ts_type})
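The bounds above are not arbitrary: 1677-09-21 through 2262-04-11 is the window representable by signed 64-bit nanoseconds since the Unix epoch, which appears to be the constraint behind the linked cudf issue (the test starts one day inside the lower extreme). A quick standalone check of that arithmetic:

from datetime import datetime, timedelta, timezone

# int64 nanosecond extremes, converted to microseconds for timedelta
epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
print(epoch + timedelta(microseconds=(-2**63) // 1000))     # ~1677-09-21
print(epoch + timedelta(microseconds=(2**63 - 1) // 1000))  # ~2262-04-11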
Example #24
def test_part_write_round_trip(spark_tmp_path, parquet_gen, v1_enabled_list, ts_type, reader_confs):
    gen_list = [('a', RepeatSeqGen(parquet_gen, 10)),
            ('b', parquet_gen)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list,
            'spark.sql.parquet.outputTimestampType': ts_type})
    all_confs.update(writer_confs)
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)
Example #25
def test_write_round_trip(spark_tmp_path, parquet_gens, v1_enabled_list, ts_type,
                          reader_confs):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list,
            'spark.sql.parquet.outputTimestampType': ts_type})
    all_confs.update(writer_confs)
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)
Example #26
def test_all_null_int96(spark_tmp_path):
    class AllNullTimestampGen(TimestampGen):
        # override the generator so every produced timestamp is null
        def start(self, rand):
            self._start(rand, lambda: None)

    data_path = spark_tmp_path + '/PARQUET_DATA'
    confs = copy_and_update(writer_confs,
                            {'spark.sql.parquet.outputTimestampType': 'INT96'})
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: unary_op_df(spark, AllNullTimestampGen()).coalesce(1).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=confs)
Example #27
def test_part_write_round_trip(spark_tmp_path, parquet_gen, mt_opt, v1_enabled_list):
    gen_list = [('a', RepeatSeqGen(parquet_gen, 10)), ('b', parquet_gen)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED',
              'spark.sql.parquet.outputTimestampType': 'TIMESTAMP_MICROS',
              'spark.rapids.sql.format.parquet.multiThreadedRead.enabled': mt_opt,
              'spark.sql.sources.useV1SourceList': v1_enabled_list})
Example #28
def test_write_map_nullable(spark_tmp_path):
    data_path = spark_tmp_path + '/PARQUET_DATA'

    def generate_map_with_empty_validity(spark, path):
        gen_data = StructGen(
            [['number', IntegerGen()], ['word', LongGen()]], nullable=False)
        df = gen_df(spark, gen_data)
        # drop rows with null keys, then build a map column from the two
        # remaining columns and write only that column
        df_noNulls = df.filter("number is not null")
        df_map = df_noNulls.withColumn("map", f.create_map(
            ["number", "word"])).drop("number").drop("word")
        df_map.coalesce(1).write.parquet(path)

    assert_gpu_and_cpu_writes_are_equal_collect(
        generate_map_with_empty_validity,
        lambda spark, path: spark.read.parquet(path), data_path)
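Every example above funnels through assert_gpu_and_cpu_writes_are_equal_collect. Its contract, in rough outline: run the write function twice, once in a CPU-only session and once with GPU acceleration enabled, writing to separate subdirectories under the base path; then read both outputs back on the CPU and assert the collected rows match, so only the write path differs between the two runs. A minimal sketch of that shape, assuming with_cpu_session / with_gpu_session helpers and an assert_equal row comparator like the ones such a suite would provide:

def assert_gpu_and_cpu_writes_are_equal_collect(write_func, read_func,
                                                base_path, conf={}):
    # Produce the same data twice: once on the CPU, once on the GPU.
    cpu_path = base_path + '/CPU'
    with_cpu_session(lambda spark: write_func(spark, cpu_path), conf=conf)
    gpu_path = base_path + '/GPU'
    with_gpu_session(lambda spark: write_func(spark, gpu_path), conf=conf)
    # Read both outputs back on the CPU and compare the collected results.
    from_cpu = with_cpu_session(
        lambda spark: read_func(spark, cpu_path).collect(), conf=conf)
    from_gpu = with_cpu_session(
        lambda spark: read_func(spark, gpu_path).collect(), conf=conf)
    assert_equal(from_cpu, from_gpu)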