Example #1
def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen,
                                           ts_rebase, spark_tmp_table_factory):
    ts_write, gen = ts_write_data_gen
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_gpu_session(lambda spark: writeParquetUpgradeCatchException(
        spark, unary_op_df(spark, gen), data_path, spark_tmp_table_factory,
        ts_rebase, ts_write))
Example #2
def assert_gpu_fallback_write(write_func,
        read_func,
        base_path,
        cpu_fallback_class_name,
        conf={}):
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    cpu_path = base_path + '/CPU'
    with_cpu_session(lambda spark : write_func(spark, cpu_path), conf=conf)
    cpu_end = time.time()
    print('### GPU RUN ###')
    jvm = spark_jvm()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.startCapture()
    gpu_start = time.time()
    gpu_path = base_path + '/GPU'
    with_gpu_session(lambda spark : write_func(spark, gpu_path), conf=conf)
    gpu_end = time.time()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.assertCapturedAndGpuFellBack(cpu_fallback_class_name, 2000)
    print('### WRITE: GPU TOOK {} CPU TOOK {} ###'.format(
        gpu_end - gpu_start, cpu_end - cpu_start))

    (cpu_bring_back, cpu_collect_type) = _prep_func_for_compare(
            lambda spark: read_func(spark, cpu_path), 'COLLECT')
    (gpu_bring_back, gpu_collect_type) = _prep_func_for_compare(
            lambda spark: read_func(spark, gpu_path), 'COLLECT')

    from_cpu = with_cpu_session(cpu_bring_back, conf=conf)
    from_gpu = with_cpu_session(gpu_bring_back, conf=conf)
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
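A minimal usage sketch of this helper follows (not part of the original example): `unary_op_df` and `IntegerGen()` appear elsewhere in these examples, while the fallback class name and the config key used to force the fallback are assumptions for illustration.

# Hypothetical test: force the Parquet write onto the CPU and verify that the
# CPU- and GPU-produced files read back identically.
def test_parquet_write_fallback_sketch(spark_tmp_path):
    data_path = spark_tmp_path + '/PARQUET_FALLBACK'
    assert_gpu_fallback_write(
        lambda spark, path: unary_op_df(spark, IntegerGen()).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        data_path,
        'DataWritingCommandExec',  # assumed CPU fallback class for the write command
        conf={'spark.rapids.sql.format.parquet.write.enabled': 'false'})  # assumed config key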
Example #3
def test_non_empty_ctas(spark_tmp_path, spark_tmp_table_factory,
                        allow_non_empty):
    data_path = spark_tmp_path + "/CTAS"
    conf = {
        "spark.sql.hive.convertCTAS": "true",
        "spark.sql.legacy.allowNonEmptyLocationInCTAS": str(allow_non_empty)
    }

    def test_it(spark):
        src_name = spark_tmp_table_factory.get()
        spark.sql("CREATE TABLE {}(id string) LOCATION '{}/src1'".format(
            src_name, data_path))
        spark.sql("INSERT INTO TABLE {} SELECT 'A'".format(src_name))
        ctas1_name = spark_tmp_table_factory.get()
        spark.sql("CREATE TABLE {}(id string) LOCATION '{}/ctas/ctas1'".format(
            ctas1_name, data_path))
        spark.sql("INSERT INTO TABLE {} SELECT 'A'".format(ctas1_name))
        try:
            ctas_with_existing_name = spark_tmp_table_factory.get()
            spark.sql("CREATE TABLE {} LOCATION '{}/ctas' AS SELECT * FROM {}".
                      format(ctas_with_existing_name, data_path, src_name))
        except pyspark.sql.utils.AnalysisException as e:
            if allow_non_empty or e.desc.find('non-empty directory') == -1:
                raise e

    with_gpu_session(test_it, conf)
Example #4
def _assert_gpu_and_cpu_writes_are_equal(write_func,
                                         read_func,
                                         base_path,
                                         mode,
                                         conf={}):
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    cpu_path = base_path + '/CPU'
    with_cpu_session(lambda spark: write_func(spark, cpu_path), conf=conf)
    cpu_end = time.time()
    print('### GPU RUN ###')
    gpu_start = time.time()
    gpu_path = base_path + '/GPU'
    with_gpu_session(lambda spark: write_func(spark, gpu_path), conf=conf)
    gpu_end = time.time()
    print('### WRITE: GPU TOOK {} CPU TOOK {} ###'.format(
        gpu_end - gpu_start, cpu_end - cpu_start))

    (cpu_bring_back, cpu_collect_type) = _prep_func_for_compare(
        lambda spark: read_func(spark, cpu_path), mode)
    (gpu_bring_back, gpu_collect_type) = _prep_func_for_compare(
        lambda spark: read_func(spark, gpu_path), mode)

    from_cpu = with_cpu_session(cpu_bring_back, conf=conf)
    from_gpu = with_cpu_session(gpu_bring_back, conf=conf)
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
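A hedged sketch of calling this private helper directly; the writer/reader lambdas and the 'COLLECT' mode are assumptions that mirror the surrounding examples, and `writer_confs` is the config dictionary referenced in later examples.

# Hypothetical call: write the same DataFrame with CPU and GPU, read both back
# on the CPU, and compare the collected rows.
def test_parquet_write_round_trip_sketch(spark_tmp_path):
    _assert_gpu_and_cpu_writes_are_equal(
        lambda spark, path: unary_op_df(spark, IntegerGen()).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        spark_tmp_path + '/PARQUET_DATA',
        'COLLECT',
        conf=writer_confs)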
Example #5
def test_ts_write_twice_fails_exception(spark_tmp_path, spark_tmp_table_factory):
    gen = IntegerGen()
    data_path = spark_tmp_path + '/PARQUET_DATA'
    table_name = spark_tmp_table_factory.get()
    with_gpu_session(
            lambda spark : unary_op_df(spark, gen).coalesce(1).write.format("parquet").mode('overwrite').option("path", data_path).saveAsTable(table_name))
    with_gpu_session(
            lambda spark : writeParquetNoOverwriteCatchException(spark, unary_op_df(spark, gen), data_path, table_name))
Example #6
def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write, ts_rebase,
                                           spark_tmp_table_factory):
    gen = TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc),
                       end=datetime(1582, 1, 1, tzinfo=timezone.utc))
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_gpu_session(lambda spark: writeParquetUpgradeCatchException(
        spark, unary_op_df(spark, gen), data_path, spark_tmp_table_factory,
        ts_rebase, ts_write))
Example #7
def test_parquet_read_daytime_interval_gpu_file(spark_tmp_path):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    gen_list = [('_c1', DayTimeIntervalGen())]
    # write DayTimeInterval with GPU
    with_gpu_session(lambda spark: gen_df(spark, gen_list).coalesce(1).write.
                     mode("overwrite").parquet(data_path))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path))
Example #8
def test_cache_array(enable_vectorized):
    def helper(spark):
        data = [("aaa", "123 456 789"), ("bbb", "444 555 666"), ("ccc", "777 888 999")]
        columns = ["a","b"]
        df = spark.createDataFrame(data).toDF(*columns)
        newdf = df.withColumn('newb', f.split(f.col('b'),' '))
        newdf.persist()
        return newdf.count()

    with_gpu_session(helper, conf = enable_vectorized)
Example #9
def test_int96_write_conf(spark_tmp_path, data_gen):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    confs = copy_and_update(
        writer_confs, {
            'spark.sql.parquet.outputTimestampType': 'INT96',
            'spark.rapids.sql.format.parquet.writer.int96.enabled': 'false'
        })
    with_gpu_session(lambda spark: unary_op_df(spark, data_gen).coalesce(1).
                     write.parquet(data_path),
                     conf=confs)
Example #10
def test_ts_read_fails_datetime_legacy(gen, spark_tmp_path, ts_write, ts_rebase, v1_enabled_list, reader_confs):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
            lambda spark : unary_op_df(spark, gen).write.parquet(data_path),
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
                'spark.sql.legacy.parquet.int96RebaseModeInWrite': ts_rebase,
                'spark.sql.parquet.outputTimestampType': ts_write})
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list})
    with_gpu_session(
            lambda spark : readParquetCatchException(spark, data_path),
            conf=all_confs)
Example #11
def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen,
                                           spark_tmp_table_factory, rebase):
    ts_write, gen = ts_write_data_gen
    data_path = spark_tmp_path + '/PARQUET_DATA'
    int96_rebase = "EXCEPTION" if (ts_write == "INT96") else rebase
    date_time_rebase = "EXCEPTION" if (ts_write
                                       == "TIMESTAMP_MICROS") else rebase
    with_gpu_session(lambda spark: writeParquetUpgradeCatchException(
        spark, unary_op_df(spark, gen), data_path, spark_tmp_table_factory,
        int96_rebase, date_time_rebase, ts_write))
    with_cpu_session(lambda spark: writeParquetUpgradeCatchException(
        spark, unary_op_df(spark, gen), data_path, spark_tmp_table_factory,
        int96_rebase, date_time_rebase, ts_write))
Example #12
def test_ts_read_fails_datetime_legacy(spark_tmp_path, ts_write, ts_rebase, mt_opt, v1_enabled_list):
    # Once https://github.com/NVIDIA/spark-rapids/issues/132 is fixed, replace this with
    # timestamp_gen
    gen = TimestampGen(start=datetime(1590, 1, 1, tzinfo=timezone.utc))
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
            lambda spark : unary_op_df(spark, gen).write.parquet(data_path),
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
                'spark.sql.parquet.outputTimestampType': ts_write})
    with_gpu_session(
            lambda spark : readParquetCatchException(spark, data_path),
            conf={'spark.rapids.sql.format.parquet.multiThreadedRead.enabled': mt_opt,
                  'spark.sql.sources.useV1SourceList': v1_enabled_list})
Example #13
def test_explain_bucket_column_not_read(spark_tmp_table_factory):
    """
    Test that the physical plan includes the information about disabling bucketed scan and the reason.
    The code is copied from:
    https://github.com/apache/spark/commit/79515e4b6c#diff-03f119698c3637b87c9ce2634c34c14bb0f7efc043ea37a0891c1ab9fbc3ebadR702
    """
    def do_explain(spark):
        tbl = spark_tmp_table_factory.get()
        spark.createDataFrame([(1, 2), (2, 3)], ("i", "j")).write.bucketBy(8, "i").saveAsTable(tbl)
        df = spark.table(tbl).select(f.col("j"))

        assert "Bucketed: false (bucket column(s) not read)" in df._sc._jvm.PythonSQLUtils.explainString(df._jdf.queryExecution(), "simple")

    with_gpu_session(do_explain)
Example #14
def assert_cpu_and_gpu_are_equal_collect_with_capture(func,
        exist_classes='',
        non_exist_classes='',
        conf={}):
    (bring_back, collect_type) = _prep_func_for_compare(func, 'COLLECT_WITH_DATAFRAME')

    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    from_cpu, cpu_df = with_cpu_session(bring_back, conf=conf)
    cpu_end = time.time()
    print('### GPU RUN ###')
    gpu_start = time.time()
    from_gpu, gpu_df = with_gpu_session(bring_back, conf=conf)
    gpu_end = time.time()
    jvm = spark_jvm()
    if exist_classes:
        for clz in exist_classes.split(','):
            jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.assertContains(gpu_df._jdf, clz)
    if non_exist_classes:
        for clz in non_exist_classes.split(','):
            jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.assertNotContain(gpu_df._jdf, clz)
    print('### {}: GPU TOOK {} CPU TOOK {} ###'.format(collect_type,
        gpu_end - gpu_start, cpu_end - cpu_start))
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
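For illustration, a hedged usage sketch: the query and the exec class names (`GpuProjectExec`, `ProjectExec`) are assumptions, chosen only to show how `exist_classes` and `non_exist_classes` are meant to be used.

# Hypothetical usage: compare CPU and GPU results for a simple projection and
# assert the captured GPU plan contains GpuProjectExec but not ProjectExec.
assert_cpu_and_gpu_are_equal_collect_with_capture(
    lambda spark: unary_op_df(spark, IntegerGen()).selectExpr('a + 1'),
    exist_classes='GpuProjectExec',
    non_exist_classes='ProjectExec')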
Example #15
def test_explain_bucket_disabled_by_query_planner(spark_tmp_table_factory):
    """
    Test that the physical plan includes the information about disabling bucketed scan and the reason.
    The code is copied from:
    https://github.com/apache/spark/commit/79515e4b6c#diff-03f119698c3637b87c9ce2634c34c14bb0f7efc043ea37a0891c1ab9fbc3ebadR700

    This test will be skipped if the Spark version is before 3.1.0, because the attribute `disableBucketedScan` is not included in `GpuFileSourceScanExec` before 3.1.0.
    """
    def do_explain(spark):
        tbl = spark_tmp_table_factory.get()
        spark.createDataFrame([(1, 2), (2, 3)], ("i", "j")).write.bucketBy(8, "i").saveAsTable(tbl)
        df = spark.table(tbl)

        assert "Bucketed: false (disabled by query planner)" in df._sc._jvm.PythonSQLUtils.explainString(df._jdf.queryExecution(), "simple")

    with_gpu_session(do_explain)
Example #16
def assert_gpu_fallback_collect(func,
        cpu_fallback_class_name,
        conf={}):
    (bring_back, collect_type) = _prep_func_for_compare(func, 'COLLECT')
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    from_cpu = with_cpu_session(bring_back, conf=conf)
    cpu_end = time.time()
    print('### GPU RUN ###')
    jvm = spark_jvm()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.startCapture()
    gpu_start = time.time()
    from_gpu = with_gpu_session(bring_back,
            conf=conf)
    gpu_end = time.time()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.assertCapturedAndGpuFellBack(cpu_fallback_class_name, 2000)
    print('### {}: GPU TOOK {} CPU TOOK {} ###'.format(collect_type,
        gpu_end - gpu_start, cpu_end - cpu_start))
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
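A hedged usage sketch; the config key that disables the expression and the fallback class name are illustrative assumptions, not taken from the example above.

# Hypothetical usage: turn an expression off on the GPU so the projection falls
# back to the CPU, then confirm CPU and GPU results still agree.
assert_gpu_fallback_collect(
    lambda spark: unary_op_df(spark, IntegerGen()).selectExpr('a + 1'),
    'ProjectExec',
    conf={'spark.rapids.sql.expression.Add': 'false'})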
Example #17
# Fragment of a nested helper: `bring_back` and `conf` come from the enclosing
# function's scope, and the result and timings are published via module-level globals.
def run_on_gpu():
    print('### GPU RUN ###')
    global gpu_start
    gpu_start = time.time()
    global from_gpu
    from_gpu = with_gpu_session(bring_back, conf=conf)
    global gpu_end
    gpu_end = time.time()
Example #18
def test_explain_bucketd_scan(spark_tmp_table_factory):
    """
    Test that the physical plan includes the information about enabling bucketed scan.
    The code is copied from:
    https://github.com/apache/spark/commit/79515e4b6c#diff-03f119698c3637b87c9ce2634c34c14bb0f7efc043ea37a0891c1ab9fbc3ebadR688
    """
    def do_explain(spark):
        tbl_1 = spark_tmp_table_factory.get()
        tbl_2 = spark_tmp_table_factory.get()
        spark.createDataFrame([(1, 2), (2, 3)], ("i", "j")).write.bucketBy(8, "i").saveAsTable(tbl_1)
        spark.createDataFrame([(2,), (3,)], ("i",)).write.bucketBy(8, "i").saveAsTable(tbl_2)
        df1 = spark.table(tbl_1)
        df2 = spark.table(tbl_2)
        joined_df = df1.join(df2, df1.i == df2.i, "inner")

        assert "Bucketed: true" in joined_df._sc._jvm.PythonSQLUtils.explainString(joined_df._jdf.queryExecution(), "simple")
    
    with_gpu_session(do_explain, {"spark.sql.autoBroadcastJoinThreshold": "0"})
Example #19
def test_catch_int96_overflow(spark_tmp_path, data_gen):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    confs = copy_and_update(writer_confs,
                            {'spark.sql.parquet.outputTimestampType': 'INT96'})
    assert_py4j_exception(
        lambda: with_gpu_session(lambda spark: unary_op_df(spark, data_gen).
                                 coalesce(1).write.parquet(data_path),
                                 conf=confs),
        "org.apache.spark.SparkException: Job aborted.")
Example #20
def assert_gpu_and_cpu_error(df_fun, conf, error_message):
    """
    Assert that both GPU and CPU execution result in the expected Java exception being thrown
    :param df_fun: a function to be verified
    :param conf: Spark config
    :param error_message: a string such as the one produced by java.lang.Exception.toString
    :return: Assertion failure if either the GPU or the CPU run does not generate the expected
             error message
    """
    assert_py4j_exception(lambda: with_cpu_session(df_fun, conf), error_message)
    assert_py4j_exception(lambda: with_gpu_session(df_fun, conf), error_message)
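A hedged usage sketch follows; the query, the ANSI config, and the exception text are assumptions (the exact message differs across Spark versions).

# Hypothetical usage: with ANSI mode enabled, both CPU and GPU should raise an
# arithmetic error for integer division by zero.
assert_gpu_and_cpu_error(
    lambda spark: unary_op_df(spark, IntegerGen()).selectExpr('a DIV 0').collect(),
    conf={'spark.sql.ansi.enabled': 'true'},
    error_message='ArithmeticException')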
Example #21
def test_json_read_with_corrupt_files(spark_tmp_path, v1_enabled_list):
    first_data_path = spark_tmp_path + '/JSON_DATA/first'
    with_cpu_session(
        lambda spark: spark.range(1).toDF("a").write.json(first_data_path))
    second_data_path = spark_tmp_path + '/JSON_DATA/second'
    with_cpu_session(
        lambda spark: spark.range(1, 2).toDF("a").write.orc(second_data_path))
    third_data_path = spark_tmp_path + '/JSON_DATA/third'
    with_cpu_session(
        lambda spark: spark.range(2, 3).toDF("a").write.json(third_data_path))

    all_confs = copy_and_update(
        _enable_all_types_conf, {
            'spark.sql.files.ignoreCorruptFiles': "true",
            'spark.sql.sources.useV1SourceList': v1_enabled_list
        })
    schema = StructType([StructField("a", IntegerType())])

    # When ignoreCorruptFiles is enabled, the GPU read should not throw an exception. The CPU can
    # read all three files successfully even without ignoring corrupt files, so there is nothing
    # meaningful to compare; we only check that the GPU read does not throw.
    with_gpu_session(lambda spark: spark.read.schema(schema).json(
        [first_data_path, second_data_path, third_data_path]).collect(),
                     conf=all_confs)
Example #22
def _assert_cpu_gpu(cpu_func, gpu_func, cpu_conf={}, gpu_conf={}, is_sort=False):
    print('### CPU RUN ###')
    cpu_start = time.time()
    cpu_ret = with_cpu_session(cpu_func, conf=cpu_conf)
    cpu_end = time.time()
    print('### GPU RUN ###')
    gpu_start = time.time()
    gpu_ret = with_gpu_session(gpu_func, conf=gpu_conf)
    gpu_end = time.time()
    print('### WRITE: GPU TOOK {} CPU TOOK {} ###'.format(
        gpu_end - gpu_start, cpu_end - cpu_start))
    if is_sort:
        assert cpu_ret.sort() == gpu_ret.sort()
    else:
        assert cpu_ret == gpu_ret
Example #23
def test_cache_posexplode_makearray(spark_tmp_path, data_gen):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'

    def write_posExplode(data_path):
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr(
                'posexplode(array(b, c, d))', 'a').cache()
            cached.count()
            cached.write.parquet(data_path)
            spark.read.parquet(data_path)

        return posExplode

    from_cpu = with_cpu_session(write_posExplode(data_path_cpu))
    from_gpu = with_gpu_session(write_posExplode(data_path_gpu))
    assert_equal(from_cpu, from_gpu)
Example #24
def test_cache_posexplode_makearray(spark_tmp_path, data_gen, ts_rebase, ts_write):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'
    def write_posExplode(data_path):
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a').cache()
            cached.count()
            cached.write.parquet(data_path)
            spark.read.parquet(data_path)
        return posExplode
    from_cpu = with_cpu_session(write_posExplode(data_path_cpu),
                 conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
                       'spark.sql.parquet.outputTimestampType': ts_write})
    from_gpu = with_gpu_session(write_posExplode(data_path_gpu),
                  conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
                        'spark.sql.parquet.outputTimestampType': ts_write})
    assert_equal(from_cpu, from_gpu)
Example #25
def _assert_gpu_and_cpu_are_equal(func, should_collect, conf={}):
    (bring_back, collect_type) = _prep_func_for_compare(func, should_collect)
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    from_cpu = with_cpu_session(bring_back, conf=conf)
    cpu_end = time.time()
    print('### GPU RUN ###')
    gpu_start = time.time()
    from_gpu = with_gpu_session(bring_back, conf=conf)
    gpu_end = time.time()
    print('### {}: GPU TOOK {} CPU TOOK {} ###'.format(collect_type,
                                                       gpu_end - gpu_start,
                                                       cpu_end - cpu_start))
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
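A hedged sketch of a direct call to this private helper; the query and config are assumptions, and the note about the public wrappers is an inference from the naming, not confirmed by the source.

# Hypothetical direct call: collect the same query on CPU and GPU and compare.
# Public helpers such as assert_gpu_and_cpu_are_equal_collect presumably delegate
# here with a fixed mode string like 'COLLECT'.
_assert_gpu_and_cpu_are_equal(
    lambda spark: unary_op_df(spark, IntegerGen()).selectExpr('a * 2'),
    'COLLECT',
    conf={'spark.rapids.sql.enabled': 'true'})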