def do_test_query(self, query):
    spark = get_spark_i_know_what_i_am_doing()
    jvm_session = _get_jvm_session(spark)
    jvm = _get_jvm(spark)
    tests = {
        "q1": jvm.com.nvidia.spark.rapids.tests.tpch.Q1Like,
        "q2": jvm.com.nvidia.spark.rapids.tests.tpch.Q2Like,
        "q3": jvm.com.nvidia.spark.rapids.tests.tpch.Q3Like,
        "q4": jvm.com.nvidia.spark.rapids.tests.tpch.Q4Like,
        "q5": jvm.com.nvidia.spark.rapids.tests.tpch.Q5Like,
        "q6": jvm.com.nvidia.spark.rapids.tests.tpch.Q6Like,
        "q7": jvm.com.nvidia.spark.rapids.tests.tpch.Q7Like,
        "q8": jvm.com.nvidia.spark.rapids.tests.tpch.Q8Like,
        "q9": jvm.com.nvidia.spark.rapids.tests.tpch.Q9Like,
        "q10": jvm.com.nvidia.spark.rapids.tests.tpch.Q10Like,
        "q11": jvm.com.nvidia.spark.rapids.tests.tpch.Q11Like,
        "q12": jvm.com.nvidia.spark.rapids.tests.tpch.Q12Like,
        "q13": jvm.com.nvidia.spark.rapids.tests.tpch.Q13Like,
        "q14": jvm.com.nvidia.spark.rapids.tests.tpch.Q14Like,
        "q15": jvm.com.nvidia.spark.rapids.tests.tpch.Q15Like,
        "q16": jvm.com.nvidia.spark.rapids.tests.tpch.Q16Like,
        "q17": jvm.com.nvidia.spark.rapids.tests.tpch.Q17Like,
        "q18": jvm.com.nvidia.spark.rapids.tests.tpch.Q18Like,
        "q19": jvm.com.nvidia.spark.rapids.tests.tpch.Q19Like,
        "q20": jvm.com.nvidia.spark.rapids.tests.tpch.Q20Like,
        "q21": jvm.com.nvidia.spark.rapids.tests.tpch.Q21Like,
        "q22": jvm.com.nvidia.spark.rapids.tests.tpch.Q22Like
    }
    # Run the JVM-side TPC-H query and wrap the result as a Python DataFrame.
    df = tests.get(query).apply(jvm_session)
    return DataFrame(df, spark.getActiveSession())
def do_test_query(self, query):
    spark = get_spark_i_know_what_i_am_doing()
    jvm_session = _get_jvm_session(spark)
    jvm = _get_jvm(spark)
    # TPC-DS queries are dispatched by name on the JVM side.
    df = jvm.com.nvidia.spark.rapids.tests.tpcds.TpcdsLikeSpark.run(jvm_session, query)
    return DataFrame(df, spark.getActiveSession())
def is_databricks_version_or_later(major, minor):
    spark = get_spark_i_know_what_i_am_doing()
    version = spark.conf.get("spark.databricks.clusterUsageTags.sparkVersion", "0.0")
    parts = version.split(".")
    if len(parts) < 2:
        raise RuntimeError("Unable to determine Databricks version from version string: " + version)
    # Compare as a (major, minor) tuple: comparing the components independently
    # would wrongly report that e.g. 12.0 is not "11.3 or later".
    return (int(parts[0]), int(parts[1])) >= (major, minor)
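# A hedged usage sketch (not from the source): version gates like this are
# typically consumed as pytest skip conditions. The test name and the (11, 3)
# threshold below are illustrative assumptions, not part of the original code.
import pytest

@pytest.mark.skipif(not is_databricks_version_or_later(11, 3),
                    reason="requires Databricks runtime 11.3 or later")
def test_feature_needing_recent_databricks():
    pass  # placeholder body for illustration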
def spark_tmp_table_factory(request):
    base_id = 'tmp_table_{}'.format(random.randint(0, 1000000))
    yield TmpTableFactory(base_id)
    # After the test finishes, drop every table created under this prefix.
    sp = get_spark_i_know_what_i_am_doing()
    tables = sp.sql("SHOW TABLES").collect()
    for row in tables:
        t_name = row['tableName']
        if t_name.startswith(base_id):
            sp.sql("DROP TABLE IF EXISTS {}".format(t_name))
def spark_tmp_table_factory(request):
    # Include the xdist worker id in the prefix so parallel workers cannot
    # generate colliding table names.
    worker_id = get_worker_id(request)
    table_id = random.getrandbits(31)
    base_id = f'tmp_table_{worker_id}_{table_id}'
    yield TmpTableFactory(base_id)
    # After the test finishes, drop every table created under this prefix.
    sp = get_spark_i_know_what_i_am_doing()
    tables = sp.sql("SHOW TABLES").collect()
    for row in tables:
        t_name = row['tableName']
        if t_name.startswith(base_id):
            sp.sql("DROP TABLE IF EXISTS {}".format(t_name))
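# get_worker_id is referenced above but not defined in this section. A minimal
# sketch, assuming pytest-xdist is in use: each worker exposes
# request.config.workerinput['workerid'] (e.g. 'gw0'), and the attribute is
# absent entirely on a non-distributed run. The 'main' fallback is an assumption.
def get_worker_id(request):
    if hasattr(request.config, 'workerinput'):
        return request.config.workerinput['workerid']  # e.g. 'gw0', 'gw1', ...
    return 'main'  # not running under xdist: a single fixed id is enough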
def do_test_query(self, query):
    spark = get_spark_i_know_what_i_am_doing()
    jvm_session = _get_jvm_session(spark)
    jvm = _get_jvm(spark)
    tests = {
        "q5": jvm.com.nvidia.spark.rapids.tests.tpcxbb.Q5Like,
        "q16": jvm.com.nvidia.spark.rapids.tests.tpcxbb.Q16Like,
        "q21": jvm.com.nvidia.spark.rapids.tests.tpcxbb.Q21Like,
        "q22": jvm.com.nvidia.spark.rapids.tests.tpcxbb.Q22Like
    }
    # Run the JVM-side TPCx-BB query and wrap the result as a Python DataFrame.
    df = tests.get(query).apply(jvm_session)
    return DataFrame(df, spark.getActiveSession())
def spark_tmp_path(request):
    debug = request.config.getoption('debug_tmp_path')
    ret = request.config.getoption('tmp_path')
    if ret is None:
        ret = '/tmp/pyspark_tests/'
    ret = ret + '/' + str(random.randint(0, 1000000)) + '/'
    # Make sure it is there and accessible
    sc = get_spark_i_know_what_i_am_doing().sparkContext
    config = sc._jsc.hadoopConfiguration()
    path = sc._jvm.org.apache.hadoop.fs.Path(ret)
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(config)
    fs.mkdirs(path)
    yield ret
    if not debug:
        # Delete recursively; the directory will contain test output files.
        fs.delete(path, True)
def spark_tmp_path(request):
    debug = request.config.getoption('debug_tmp_path')
    ret = request.config.getoption('tmp_path')
    if ret is None:
        ret = '/tmp/pyspark_tests/'
    # Make the path unique per host, xdist worker, and process so that
    # parallel test runs cannot collide.
    worker_id = get_worker_id(request)
    pid = os.getpid()
    hostname = os.uname()[1]
    ret = f'{ret}/{hostname}-{worker_id}-{pid}-{random.randrange(0, 1<<31)}/'
    # Make sure it is there and accessible
    sc = get_spark_i_know_what_i_am_doing().sparkContext
    config = sc._jsc.hadoopConfiguration()
    path = sc._jvm.org.apache.hadoop.fs.Path(ret)
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(config)
    fs.mkdirs(path)
    yield ret
    if not debug:
        # Delete recursively; the directory will contain test output files.
        fs.delete(path, True)
from conftest import is_allowing_any_non_gpu, get_non_gpu_allowed, get_validate_execs_in_gpu_plan
from pyspark.sql import SparkSession, DataFrame
from spark_init_internal import get_spark_i_know_what_i_am_doing, spark_version

def _from_scala_map(scala_map):
    ret = {}
    # The value we get is a scala map, not a java map, so we need to jump through some hoops
    keys = scala_map.keys().iterator()
    while keys.hasNext():
        key = keys.next()
        ret[key] = scala_map.get(key).get()
    return ret

_spark = get_spark_i_know_what_i_am_doing()
# Have to reach into a private member to get access to the API we need
_orig_conf = _from_scala_map(_spark.conf._jconf.getAll())
_orig_conf_keys = _orig_conf.keys()

def is_tz_utc(spark=_spark):
    """ true if the tz is UTC else false """
    # Now we have to do some kind of ugly internal java stuff
    jvm = spark.sparkContext._jvm
    utc = jvm.java.time.ZoneId.of('UTC').normalized()
    sys_tz = jvm.java.time.ZoneId.systemDefault().normalized()
    return utc == sys_tz
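# A hedged sketch (not from the source) of how the captured _orig_conf might be
# used: restoring the session conf after a test has mutated it. The helper name
# reset_spark_session_conf is an assumption, not a confirmed API of this module,
# and static confs that cannot be changed at runtime are ignored here.
def reset_spark_session_conf(spark=_spark):
    current = _from_scala_map(spark.conf._jconf.getAll())
    for key in current:
        if key not in _orig_conf_keys:
            spark.conf.unset(key)  # drop keys that were not set originally
    for key, value in _orig_conf.items():
        spark.conf.set(key, value)  # restore the original values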
def __init__(self, tpcds_format, tpcds_path):
    self.tpcds_format = tpcds_format
    self.tpcds_path = tpcds_path
    # Set up the TPC-DS data against the active session.
    self.setup(get_spark_i_know_what_i_am_doing())
def spark_jvm():
    return _get_jvm(get_spark_i_know_what_i_am_doing())