def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen, ts_rebase, spark_tmp_table_factory):
    ts_write, gen = ts_write_data_gen
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_gpu_session(lambda spark: writeParquetUpgradeCatchException(
        spark, unary_op_df(spark, gen), data_path, spark_tmp_table_factory, ts_rebase, ts_write))
def assert_gpu_fallback_write(write_func, read_func, base_path, cpu_fallback_class_name, conf={}):
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    cpu_path = base_path + '/CPU'
    with_cpu_session(lambda spark: write_func(spark, cpu_path), conf=conf)
    cpu_end = time.time()

    print('### GPU RUN ###')
    jvm = spark_jvm()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.startCapture()
    gpu_start = time.time()
    gpu_path = base_path + '/GPU'
    with_gpu_session(lambda spark: write_func(spark, gpu_path), conf=conf)
    gpu_end = time.time()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.assertCapturedAndGpuFellBack(cpu_fallback_class_name, 2000)
    print('### WRITE: GPU TOOK {} CPU TOOK {} ###'.format(
        gpu_end - gpu_start, cpu_end - cpu_start))

    (cpu_bring_back, cpu_collect_type) = _prep_func_for_compare(
        lambda spark: read_func(spark, cpu_path), 'COLLECT')
    (gpu_bring_back, gpu_collect_type) = _prep_func_for_compare(
        lambda spark: read_func(spark, gpu_path), 'COLLECT')

    # Read both outputs back on the CPU so any difference reflects the write path only.
    from_cpu = with_cpu_session(cpu_bring_back, conf=conf)
    from_gpu = with_cpu_session(gpu_bring_back, conf=conf)
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
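# A minimal usage sketch, not part of the suite: a bucketed table write is a case the
# plugin is known to fall back on, and 'DataWritingCommandExec' is the usual CPU
# fallback class for writes. The table layout here is an illustrative assumption.
def _example_bucketed_write_falls_back(spark_tmp_path, spark_tmp_table_factory):
    assert_gpu_fallback_write(
        lambda spark, path: spark.range(100).write.bucketBy(4, 'id')
            .format('parquet').mode('overwrite').option('path', path)
            .saveAsTable(spark_tmp_table_factory.get()),
        lambda spark, path: spark.read.parquet(path),
        spark_tmp_path + '/PARQUET_DATA',
        'DataWritingCommandExec')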
def test_non_empty_ctas(spark_tmp_path, spark_tmp_table_factory, allow_non_empty):
    data_path = spark_tmp_path + "/CTAS"
    conf = {
        "spark.sql.hive.convertCTAS": "true",
        "spark.sql.legacy.allowNonEmptyLocationInCTAS": str(allow_non_empty)
    }
    def test_it(spark):
        src_name = spark_tmp_table_factory.get()
        spark.sql("CREATE TABLE {}(id string) LOCATION '{}/src1'".format(
            src_name, data_path))
        spark.sql("INSERT INTO TABLE {} SELECT 'A'".format(src_name))
        ctas1_name = spark_tmp_table_factory.get()
        spark.sql("CREATE TABLE {}(id string) LOCATION '{}/ctas/ctas1'".format(
            ctas1_name, data_path))
        spark.sql("INSERT INTO TABLE {} SELECT 'A'".format(ctas1_name))
        try:
            ctas_with_existing_name = spark_tmp_table_factory.get()
            spark.sql("CREATE TABLE {} LOCATION '{}/ctas' AS SELECT * FROM {}".format(
                ctas_with_existing_name, data_path, src_name))
        except pyspark.sql.utils.AnalysisException as e:
            if allow_non_empty or e.desc.find('non-empty directory') == -1:
                raise e
    with_gpu_session(test_it, conf)
def _assert_gpu_and_cpu_writes_are_equal(write_func, read_func, base_path, mode, conf={}):
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    cpu_path = base_path + '/CPU'
    with_cpu_session(lambda spark: write_func(spark, cpu_path), conf=conf)
    cpu_end = time.time()

    print('### GPU RUN ###')
    gpu_start = time.time()
    gpu_path = base_path + '/GPU'
    with_gpu_session(lambda spark: write_func(spark, gpu_path), conf=conf)
    gpu_end = time.time()
    print('### WRITE: GPU TOOK {} CPU TOOK {} ###'.format(
        gpu_end - gpu_start, cpu_end - cpu_start))

    (cpu_bring_back, cpu_collect_type) = _prep_func_for_compare(
        lambda spark: read_func(spark, cpu_path), mode)
    (gpu_bring_back, gpu_collect_type) = _prep_func_for_compare(
        lambda spark: read_func(spark, gpu_path), mode)

    # Read both outputs back on the CPU so any difference reflects the write path only.
    from_cpu = with_cpu_session(cpu_bring_back, conf=conf)
    from_gpu = with_cpu_session(gpu_bring_back, conf=conf)
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
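# A minimal usage sketch, not part of the suite: write Parquet on the CPU and the GPU
# and compare the rows read back from each output. 'COLLECT' matches the mode strings
# used with _prep_func_for_compare elsewhere in this module.
def _example_parquet_write_round_trip(spark_tmp_path):
    _assert_gpu_and_cpu_writes_are_equal(
        lambda spark, path: spark.range(100).write.parquet(path),
        lambda spark, path: spark.read.parquet(path),
        spark_tmp_path + '/PARQUET_DATA',
        'COLLECT')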
def test_ts_write_twice_fails_exception(spark_tmp_path, spark_tmp_table_factory):
    gen = IntegerGen()
    data_path = spark_tmp_path + '/PARQUET_DATA'
    table_name = spark_tmp_table_factory.get()
    with_gpu_session(
        lambda spark: unary_op_df(spark, gen).coalesce(1).write.format("parquet")
            .mode('overwrite').option("path", data_path).saveAsTable(table_name))
    with_gpu_session(
        lambda spark: writeParquetNoOverwriteCatchException(
            spark, unary_op_df(spark, gen), data_path, table_name))
def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write, ts_rebase, spark_tmp_table_factory):
    gen = TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc),
                       end=datetime(1582, 1, 1, tzinfo=timezone.utc))
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_gpu_session(lambda spark: writeParquetUpgradeCatchException(
        spark, unary_op_df(spark, gen), data_path, spark_tmp_table_factory, ts_rebase, ts_write))
def test_parquet_read_daytime_interval_gpu_file(spark_tmp_path):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    gen_list = [('_c1', DayTimeIntervalGen())]
    # write DayTimeInterval with GPU
    with_gpu_session(lambda spark: gen_df(spark, gen_list).coalesce(1)
        .write.mode("overwrite").parquet(data_path))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path))
def test_cache_array(enable_vectorized):
    def helper(spark):
        data = [("aaa", "123 456 789"), ("bbb", "444 555 666"), ("ccc", "777 888 999")]
        columns = ["a", "b"]
        df = spark.createDataFrame(data).toDF(*columns)
        newdf = df.withColumn('newb', f.split(f.col('b'), ' '))
        newdf.persist()
        return newdf.count()
    with_gpu_session(helper, conf=enable_vectorized)
def test_int96_write_conf(spark_tmp_path, data_gen):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    confs = copy_and_update(writer_confs, {
        'spark.sql.parquet.outputTimestampType': 'INT96',
        'spark.rapids.sql.format.parquet.writer.int96.enabled': 'false'
    })
    with_gpu_session(
        lambda spark: unary_op_df(spark, data_gen).coalesce(1).write.parquet(data_path),
        conf=confs)
def test_ts_read_fails_datetime_legacy(gen, spark_tmp_path, ts_write, ts_rebase, v1_enabled_list, reader_confs):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: unary_op_df(spark, gen).write.parquet(data_path),
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
              'spark.sql.legacy.parquet.int96RebaseModeInWrite': ts_rebase,
              'spark.sql.parquet.outputTimestampType': ts_write})
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list})
    with_gpu_session(
        lambda spark: readParquetCatchException(spark, data_path),
        conf=all_confs)
def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen, spark_tmp_table_factory, rebase):
    ts_write, gen = ts_write_data_gen
    data_path = spark_tmp_path + '/PARQUET_DATA'
    int96_rebase = "EXCEPTION" if (ts_write == "INT96") else rebase
    date_time_rebase = "EXCEPTION" if (ts_write == "TIMESTAMP_MICROS") else rebase
    with_gpu_session(lambda spark: writeParquetUpgradeCatchException(
        spark, unary_op_df(spark, gen), data_path, spark_tmp_table_factory,
        int96_rebase, date_time_rebase, ts_write))
    with_cpu_session(lambda spark: writeParquetUpgradeCatchException(
        spark, unary_op_df(spark, gen), data_path, spark_tmp_table_factory,
        int96_rebase, date_time_rebase, ts_write))
def test_ts_read_fails_datetime_legacy(spark_tmp_path, ts_write, ts_rebase, mt_opt, v1_enabled_list):
    # Once https://github.com/NVIDIA/spark-rapids/issues/132 is fixed replace this with
    # timestamp_gen
    gen = TimestampGen(start=datetime(1590, 1, 1, tzinfo=timezone.utc))
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: unary_op_df(spark, gen).write.parquet(data_path),
        conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
              'spark.sql.parquet.outputTimestampType': ts_write})
    with_gpu_session(
        lambda spark: readParquetCatchException(spark, data_path),
        conf={'spark.rapids.sql.format.parquet.multiThreadedRead.enabled': mt_opt,
              'spark.sql.sources.useV1SourceList': v1_enabled_list})
def test_explain_bucket_column_not_read(spark_tmp_table_factory):
    """
    Test that the physical plan reports bucketed scan as disabled and gives the reason.
    The code is copied from:
    https://github.com/apache/spark/commit/79515e4b6c#diff-03f119698c3637b87c9ce2634c34c14bb0f7efc043ea37a0891c1ab9fbc3ebadR702
    """
    def do_explain(spark):
        tbl = spark_tmp_table_factory.get()
        spark.createDataFrame([(1, 2), (2, 3)], ("i", "j")).write.bucketBy(8, "i").saveAsTable(tbl)
        df = spark.table(tbl).select(f.col("j"))
        assert "Bucketed: false (bucket column(s) not read)" in df._sc._jvm.PythonSQLUtils.explainString(df._jdf.queryExecution(), "simple")
    with_gpu_session(do_explain)
def assert_cpu_and_gpu_are_equal_collect_with_capture(func,
        exist_classes='',
        non_exist_classes='',
        conf={}):
    (bring_back, collect_type) = _prep_func_for_compare(func, 'COLLECT_WITH_DATAFRAME')
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    from_cpu, cpu_df = with_cpu_session(bring_back, conf=conf)
    cpu_end = time.time()

    print('### GPU RUN ###')
    gpu_start = time.time()
    from_gpu, gpu_df = with_gpu_session(bring_back, conf=conf)
    gpu_end = time.time()

    jvm = spark_jvm()
    if exist_classes:
        for clz in exist_classes.split(','):
            jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.assertContains(gpu_df._jdf, clz)
    if non_exist_classes:
        for clz in non_exist_classes.split(','):
            jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.assertNotContain(gpu_df._jdf, clz)
    print('### {}: GPU TOOK {} CPU TOOK {} ###'.format(collect_type,
        gpu_end - gpu_start, cpu_end - cpu_start))
    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
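# A minimal usage sketch, not part of the suite: check that a simple projection both
# matches the CPU result and actually ran on the GPU. 'GpuProjectExec' follows the
# plugin's Gpu* naming convention and is an assumption here.
def _example_project_runs_on_gpu():
    assert_cpu_and_gpu_are_equal_collect_with_capture(
        lambda spark: spark.range(100).selectExpr('id + 1 AS x'),
        exist_classes='GpuProjectExec')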
def test_explain_bucket_disabled_by_query_planner(spark_tmp_table_factory):
    """
    Test that the physical plan reports bucketed scan as disabled and gives the reason.
    The code is copied from:
    https://github.com/apache/spark/commit/79515e4b6c#diff-03f119698c3637b87c9ce2634c34c14bb0f7efc043ea37a0891c1ab9fbc3ebadR700

    This test is skipped when the Spark version is before 3.1.0, because the attribute
    `disableBucketedScan` is not included in `GpuFileSourceScanExec` before 3.1.0.
    """
    def do_explain(spark):
        tbl = spark_tmp_table_factory.get()
        spark.createDataFrame([(1, 2), (2, 3)], ("i", "j")).write.bucketBy(8, "i").saveAsTable(tbl)
        df = spark.table(tbl)
        assert "Bucketed: false (disabled by query planner)" in df._sc._jvm.PythonSQLUtils.explainString(df._jdf.queryExecution(), "simple")
    with_gpu_session(do_explain)
def assert_gpu_fallback_collect(func, cpu_fallback_class_name, conf={}):
    (bring_back, collect_type) = _prep_func_for_compare(func, 'COLLECT')
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    from_cpu = with_cpu_session(bring_back, conf=conf)
    cpu_end = time.time()

    print('### GPU RUN ###')
    jvm = spark_jvm()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.startCapture()
    gpu_start = time.time()
    from_gpu = with_gpu_session(bring_back, conf=conf)
    gpu_end = time.time()
    jvm.com.nvidia.spark.rapids.ExecutionPlanCaptureCallback.assertCapturedAndGpuFellBack(cpu_fallback_class_name, 2000)
    print('### {}: GPU TOOK {} CPU TOOK {} ###'.format(collect_type,
        gpu_end - gpu_start, cpu_end - cpu_start))

    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
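# A minimal usage sketch, not part of the suite: disable one expression so the plan
# cannot run fully on the GPU, then assert the capture shows a fallback to ProjectExec.
# The spark.rapids.sql.expression.* conf pattern is the plugin's usual opt-out switch
# and is an assumption here.
def _example_project_fallback():
    assert_gpu_fallback_collect(
        lambda spark: spark.range(100).selectExpr('id + 1 AS x'),
        'ProjectExec',
        conf={'spark.rapids.sql.expression.Add': 'false'})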
def run_on_gpu():
    print('### GPU RUN ###')
    global gpu_start
    gpu_start = time.time()
    global from_gpu
    from_gpu = with_gpu_session(bring_back, conf=conf)
    global gpu_end
    gpu_end = time.time()
def test_explain_bucketed_scan(spark_tmp_table_factory):
    """
    Test that the physical plan reports bucketed scan as enabled.
    The code is copied from:
    https://github.com/apache/spark/commit/79515e4b6c#diff-03f119698c3637b87c9ce2634c34c14bb0f7efc043ea37a0891c1ab9fbc3ebadR688
    """
    def do_explain(spark):
        tbl_1 = spark_tmp_table_factory.get()
        tbl_2 = spark_tmp_table_factory.get()
        spark.createDataFrame([(1, 2), (2, 3)], ("i", "j")).write.bucketBy(8, "i").saveAsTable(tbl_1)
        spark.createDataFrame([(2,), (3,)], ("i",)).write.bucketBy(8, "i").saveAsTable(tbl_2)
        df1 = spark.table(tbl_1)
        df2 = spark.table(tbl_2)
        joined_df = df1.join(df2, df1.i == df2.i, "inner")
        assert "Bucketed: true" in joined_df._sc._jvm.PythonSQLUtils.explainString(joined_df._jdf.queryExecution(), "simple")
    with_gpu_session(do_explain, {"spark.sql.autoBroadcastJoinThreshold": "0"})
def test_catch_int96_overflow(spark_tmp_path, data_gen):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    confs = copy_and_update(writer_confs, {'spark.sql.parquet.outputTimestampType': 'INT96'})
    assert_py4j_exception(
        lambda: with_gpu_session(
            lambda spark: unary_op_df(spark, data_gen).coalesce(1).write.parquet(data_path),
            conf=confs),
        "org.apache.spark.SparkException: Job aborted.")
def assert_gpu_and_cpu_error(df_fun, conf, error_message):
    """
    Assert that both GPU and CPU execution throw a specific Java exception.
    :param df_fun: a function to be verified
    :param conf: Spark config
    :param error_message: a string such as the one produced by java.lang.Exception.toString
    :return: Assertion failure if either the GPU or the CPU run does not generate the
        expected error message
    """
    assert_py4j_exception(lambda: with_cpu_session(df_fun, conf), error_message)
    assert_py4j_exception(lambda: with_gpu_session(df_fun, conf), error_message)
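# A minimal usage sketch, not part of the suite: under ANSI mode both CPU and GPU
# should fail integer division by zero. The conf key is standard Spark; the message
# fragment depends on the Spark version under test and is an assumption here.
def _example_div_by_zero_raises():
    assert_gpu_and_cpu_error(
        lambda spark: spark.sql('SELECT 1 DIV 0').collect(),
        conf={'spark.sql.ansi.enabled': 'true'},
        error_message='ArithmeticException')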
def test_json_read_with_corrupt_files(spark_tmp_path, v1_enabled_list):
    first_data_path = spark_tmp_path + '/JSON_DATA/first'
    with_cpu_session(
        lambda spark: spark.range(1).toDF("a").write.json(first_data_path))
    second_data_path = spark_tmp_path + '/JSON_DATA/second'
    with_cpu_session(
        lambda spark: spark.range(1, 2).toDF("a").write.orc(second_data_path))
    third_data_path = spark_tmp_path + '/JSON_DATA/third'
    with_cpu_session(
        lambda spark: spark.range(2, 3).toDF("a").write.json(third_data_path))
    all_confs = copy_and_update(_enable_all_types_conf, {
        'spark.sql.files.ignoreCorruptFiles': "true",
        'spark.sql.sources.useV1SourceList': v1_enabled_list
    })
    schema = StructType([StructField("a", IntegerType())])
    # With ignoreCorruptFiles enabled, the GPU read should not throw even though the
    # second path holds ORC data rather than JSON. The CPU can read all three files
    # successfully even without ignoring corrupt files, so rather than comparing
    # results we only check that the GPU read does not throw.
    with_gpu_session(
        lambda spark: spark.read.schema(schema).json(
            [first_data_path, second_data_path, third_data_path]).collect(),
        conf=all_confs)
def _assert_cpu_gpu(cpu_func, gpu_func, cpu_conf={}, gpu_conf={}, is_sort=False):
    print('### CPU RUN ###')
    cpu_start = time.time()
    cpu_ret = with_cpu_session(cpu_func, conf=cpu_conf)
    cpu_end = time.time()

    print('### GPU RUN ###')
    gpu_start = time.time()
    gpu_ret = with_gpu_session(gpu_func, conf=gpu_conf)
    gpu_end = time.time()
    print('### WRITE: GPU TOOK {} CPU TOOK {} ###'.format(
        gpu_end - gpu_start, cpu_end - cpu_start))

    if is_sort:
        # list.sort() returns None, so compare sorted copies instead of the in-place
        # sort results
        assert sorted(cpu_ret) == sorted(gpu_ret)
    else:
        assert cpu_ret == gpu_ret
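# A minimal usage sketch, not part of the suite: compare a scalar result from the same
# query on CPU and GPU; pass is_sort=True only when both functions return lists whose
# ordering is not deterministic.
def _example_count_matches():
    _assert_cpu_gpu(
        lambda spark: spark.range(1000).count(),
        lambda spark: spark.range(1000).count())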
def test_cache_posexplode_makearray(spark_tmp_path, data_gen):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'
    def write_posExplode(data_path):
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr(
                'posexplode(array(b, c, d))', 'a').cache()
            cached.count()
            cached.write.parquet(data_path)
            # return the rows read back so the CPU and GPU results can be compared
            return spark.read.parquet(data_path).collect()
        return posExplode
    from_cpu = with_cpu_session(write_posExplode(data_path_cpu))
    from_gpu = with_gpu_session(write_posExplode(data_path_gpu))
    assert_equal(from_cpu, from_gpu)
def test_cache_posexplode_makearray(spark_tmp_path, data_gen, ts_rebase, ts_write):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'
    def write_posExplode(data_path):
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr(
                'posexplode(array(b, c, d))', 'a').cache()
            cached.count()
            cached.write.parquet(data_path)
            # return the rows read back so the CPU and GPU results can be compared
            return spark.read.parquet(data_path).collect()
        return posExplode
    rebase_confs = {'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
                    'spark.sql.parquet.outputTimestampType': ts_write}
    from_cpu = with_cpu_session(write_posExplode(data_path_cpu), conf=rebase_confs)
    from_gpu = with_gpu_session(write_posExplode(data_path_gpu), conf=rebase_confs)
    assert_equal(from_cpu, from_gpu)
def _assert_gpu_and_cpu_are_equal(func, should_collect, conf={}):
    (bring_back, collect_type) = _prep_func_for_compare(func, should_collect)
    conf = _prep_incompat_conf(conf)

    print('### CPU RUN ###')
    cpu_start = time.time()
    from_cpu = with_cpu_session(bring_back, conf=conf)
    cpu_end = time.time()

    print('### GPU RUN ###')
    gpu_start = time.time()
    from_gpu = with_gpu_session(bring_back, conf=conf)
    gpu_end = time.time()
    print('### {}: GPU TOOK {} CPU TOOK {} ###'.format(collect_type,
        gpu_end - gpu_start, cpu_end - cpu_start))

    if should_sort_locally():
        from_cpu.sort(key=_RowCmp)
        from_gpu.sort(key=_RowCmp)

    assert_equal(from_cpu, from_gpu)
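# The public wrappers in this module typically delegate to the helper above; a minimal
# sketch of such a wrapper (hypothetical name, mirroring the 'COLLECT' mode used by the
# other helpers here):
def _example_assert_collect(func, conf={}):
    _assert_gpu_and_cpu_are_equal(func, 'COLLECT', conf=conf)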