def Run(benchmark_spec):
  """Runs a sequence of Spark SQL queries.

  Args:
    benchmark_spec: Spec needed to run the Spark SQL.

  Returns:
    A list of samples, comprising the detailed run times of individual
    queries.

  Raises:
    Benchmarks.RunError if no query succeeds.
  """
  dpb_service_instance = benchmark_spec.dpb_service
  metadata = benchmark_spec.dpb_service.GetMetadata()
  metadata['benchmark'] = BENCHMARK_NAMES[FLAGS.dpb_sparksql_query]

  results = []
  failing_queries = []
  run_times = {}
  wall_times = {}
  for query in benchmark_spec.queries:
    try:
      result = _RunSparkSqlJob(
          dpb_service_instance,
          os.path.join(benchmark_spec.base_dir, query + '.sql'),
          os.path.join(benchmark_spec.base_dir, SPARK_SQL_RUNNER_SCRIPT),
          benchmark_spec.table_subdirs)
      logging.info(result)
      metadata_copy = metadata.copy()
      metadata_copy['query'] = query
      results.append(
          sample.Sample('sparksql_wall_time', result.wall_time, 'seconds',
                        metadata_copy))
      results.append(
          sample.Sample('sparksql_run_time', result.run_time, 'seconds',
                        metadata_copy))
      wall_times[query] = result.wall_time
      run_times[query] = result.run_time
    except dpb_service.JobSubmissionError:
      failing_queries.append(query)

  metadata['failing_queries'] = ','.join(sorted(failing_queries))

  if results:
    results.append(
        sample.Sample(
            'sparksql_total_wall_time',
            np.fromiter(wall_times.values(), dtype='float').sum(), 'seconds',
            metadata))
    results.append(
        sample.Sample(
            'sparksql_total_run_time',
            np.fromiter(run_times.values(), dtype='float').sum(), 'seconds',
            metadata))
    results.append(
        sample.Sample('sparksql_geomean_wall_time',
                      sample.GeoMean(wall_times.values()), 'seconds',
                      metadata))
    results.append(
        sample.Sample('sparksql_geomean_run_time',
                      sample.GeoMean(run_times.values()), 'seconds', metadata))
  else:
    raise errors.Benchmarks.RunError('No queries succeeded.')
  return results
def Run(benchmark_spec):
  """Runs a sequence of Spark SQL queries.

  Args:
    benchmark_spec: Spec needed to run the Spark SQL.

  Returns:
    A list of samples, comprising the detailed run times of individual
    queries.

  Raises:
    Benchmarks.RunError if no query succeeds.
  """
  dpb_service_instance = benchmark_spec.dpb_service
  metadata = benchmark_spec.dpb_service.GetMetadata()
  metadata['benchmark'] = BENCHMARK_NAMES[FLAGS.dpb_sparksql_query]

  results = []
  unit = 'seconds'
  failing_queries = []
  run_times = {}
  wall_times = {}
  for query in FLAGS.dpb_sparksql_order:
    stats = _RunSparkSqlJob(
        dpb_service_instance,
        os.path.join(benchmark_spec.base_dir, query + '.sql'),
        os.path.join(benchmark_spec.base_dir, SPARK_SQL_RUNNER_SCRIPT))
    logging.info(stats)
    metadata_copy = metadata.copy()
    metadata_copy['query'] = query
    if stats[dpb_service.SUCCESS]:
      run_time = stats[dpb_service.RUNTIME]
      wall_time = run_time + stats[dpb_service.WAITING]
      results.append(
          sample.Sample('sparksql_wall_time', wall_time, unit, metadata_copy))
      results.append(
          sample.Sample('sparksql_run_time', run_time, unit, metadata_copy))
      wall_times[query] = wall_time
      run_times[query] = run_time
    else:
      failing_queries.append(query)

  metadata['failing_queries'] = ','.join(sorted(failing_queries))

  if results:
    results.append(
        sample.Sample(
            'sparksql_total_wall_time',
            np.fromiter(wall_times.values(), dtype='float').sum(), unit,
            metadata))
    results.append(
        sample.Sample(
            'sparksql_total_run_time',
            np.fromiter(run_times.values(), dtype='float').sum(), unit,
            metadata))
    results.append(
        sample.Sample('sparksql_geomean_wall_time',
                      sample.GeoMean(wall_times.values()), unit, metadata))
    results.append(
        sample.Sample('sparksql_geomean_run_time',
                      sample.GeoMean(run_times.values()), unit, metadata))
  else:
    raise errors.Benchmarks.RunError('No queries succeeded.')
  return results
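# The two versions of Run above delegate job submission to a _RunSparkSqlJob
# helper defined elsewhere in this module. The minimal sketch below is
# grounded only in what the call sites show: it takes the staged SQL script,
# the staged PySpark runner, and optionally table subdirectories, submits the
# runner through the dpb service, and returns the submission result the caller
# inspects (e.g. stats keyed by dpb_service.SUCCESS, RUNTIME and WAITING).
# The runner's command-line layout here is an illustrative assumption, not the
# benchmark's actual helper.
def _RunSparkSqlJob(dpb_service_instance,
                    staged_sql_file,
                    staged_runner_script,
                    table_subdirs=None):
  """Submits one Spark SQL script via the PySpark runner and returns stats."""
  job_arguments = [staged_sql_file]  # assumed runner CLI: script path first
  if table_subdirs:
    job_arguments += table_subdirs
  return dpb_service_instance.SubmitJob(
      pyspark_file=staged_runner_script,
      job_arguments=job_arguments,
      job_type=dpb_service.BaseDpbService.PYSPARK_JOB_TYPE)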
def Run(benchmark_spec):
  """Runs a sequence of Spark SQL queries.

  Args:
    benchmark_spec: Spec needed to run the Spark SQL.

  Returns:
    A list of samples, comprising the detailed run times of individual
    queries.

  Raises:
    Benchmarks.RunError if no query succeeds.
  """
  dpb_service_instance = benchmark_spec.dpb_service
  storage_service = dpb_service_instance.storage_service
  metadata = benchmark_spec.dpb_service.GetMetadata()
  metadata['benchmark'] = BENCHMARK_NAMES[FLAGS.dpb_sparksql_query]

  # Run PySpark Spark SQL Runner
  report_dir = os.path.join(benchmark_spec.base_dir, 'report')
  args = [
      '--sql-scripts',
      ','.join(benchmark_spec.staged_queries),
      '--report-dir',
      report_dir,
  ]
  table_metadata_file = _GetStagedTableMetadata(storage_service,
                                                benchmark_spec)
  if table_metadata_file:
    args += ['--table-metadata', table_metadata_file]
  jars = []
  if FLAGS.spark_bigquery_connector:
    jars.append(FLAGS.spark_bigquery_connector)
  job_result = dpb_service_instance.SubmitJob(
      pyspark_file=os.path.join(benchmark_spec.base_dir,
                                SPARK_SQL_RUNNER_SCRIPT),
      job_arguments=args,
      job_jars=jars,
      job_type=dpb_service.BaseDpbService.PYSPARK_JOB_TYPE)

  # Spark can only write data to directories, not files, so do a recursive
  # copy of that directory and then search it for the single JSON file with
  # the results.
  temp_run_dir = temp_dir.GetRunDirPath()
  storage_service.Copy(report_dir, temp_run_dir, recursive=True)
  report_file = None
  for dir_name, _, files in os.walk(os.path.join(temp_run_dir, 'report')):
    for filename in files:
      if filename.endswith('.json'):
        report_file = os.path.join(dir_name, filename)
        logging.info(report_file)
  if not report_file:
    raise errors.Benchmarks.RunError('Job report not found.')

  results = []
  run_times = {}
  passing_queries = set()
  with open(report_file, 'r') as file:
    for line in file:
      result = json.loads(line)
      logging.info('Timing: %s', result)
      query_id = _GetQueryId(result['script'])
      assert query_id
      passing_queries.add(query_id)
      metadata_copy = metadata.copy()
      metadata_copy['query'] = query_id
      results.append(
          sample.Sample('sparksql_run_time', result['duration'], 'seconds',
                        metadata_copy))
      run_times[query_id] = result['duration']

  metadata['failing_queries'] = ','.join(
      sorted(set(FLAGS.dpb_sparksql_order) - passing_queries))

  results.append(
      sample.Sample('sparksql_total_wall_time', job_result.wall_time,
                    'seconds', metadata))
  results.append(
      sample.Sample('sparksql_geomean_run_time',
                    sample.GeoMean(run_times.values()), 'seconds', metadata))
  return results
def Run(benchmark_spec):
  """Runs a sequence of Spark SQL queries.

  Args:
    benchmark_spec: Spec needed to run the Spark SQL.

  Returns:
    A list of samples, comprising the detailed run times of individual
    queries.

  Raises:
    Benchmarks.RunError if no query succeeds.
  """
  cluster = benchmark_spec.dpb_service
  storage_service = cluster.storage_service
  metadata = benchmark_spec.dpb_service.GetMetadata()
  metadata['benchmark'] = BENCHMARK_NAMES[FLAGS.dpb_sparksql_query]
  if FLAGS.bigquery_record_format:
    # This takes higher priority since for BQ dpb_sparksql_data_format
    # actually holds a fully qualified Java class/package name.
    metadata['data_format'] = FLAGS.bigquery_record_format
  elif FLAGS.dpb_sparksql_data_format:
    metadata['data_format'] = FLAGS.dpb_sparksql_data_format
  if FLAGS.dpb_sparksql_data_compression:
    metadata['data_compression'] = FLAGS.dpb_sparksql_data_compression

  # Run PySpark Spark SQL Runner
  report_dir = '/'.join([cluster.base_dir, f'report-{int(time.time()*1000)}'])
  args = [
      '--sql-scripts',
      ','.join(benchmark_spec.staged_queries),
      '--report-dir',
      report_dir,
  ]
  if FLAGS.dpb_sparksql_database:
    args += ['--database', FLAGS.dpb_sparksql_database]
  table_metadata = _GetTableMetadata(benchmark_spec)
  if table_metadata:
    table_metadata_file = '/'.join([cluster.base_dir, 'metadata.json'])
    _StageMetadata(table_metadata, storage_service, table_metadata_file)
    args += ['--table-metadata', table_metadata_file]
  else:
    # If we don't pass in tables, we must be reading from Hive.
    # Note you can even read from Hive without --create_hive_tables if the
    # tables were precreated.
    args += ['--enable-hive', 'True']
  if FLAGS.dpb_sparksql_table_cache:
    args += ['--table-cache', FLAGS.dpb_sparksql_table_cache]
  if FLAGS.dpb_sparksql_simultaneous:
    args += ['--simultaneous', 'True']
  jars = []
  if FLAGS.spark_bigquery_connector:
    jars.append(FLAGS.spark_bigquery_connector)
  job_result = cluster.SubmitJob(
      pyspark_file='/'.join([cluster.base_dir, SPARK_SQL_RUNNER_SCRIPT]),
      job_arguments=args,
      job_jars=jars,
      job_type=dpb_service.BaseDpbService.PYSPARK_JOB_TYPE)

  # Spark can only write data to directories, not files, so do a recursive
  # copy of that directory and then search it for the single JSON file with
  # the results.
  temp_run_dir = temp_dir.GetRunDirPath()
  storage_service.Copy(report_dir, temp_run_dir, recursive=True)
  report_file = None
  for dir_name, _, files in os.walk(
      os.path.join(temp_run_dir, os.path.basename(report_dir))):
    for filename in files:
      if filename.endswith('.json'):
        report_file = os.path.join(dir_name, filename)
        logging.info(report_file)
  if not report_file:
    raise errors.Benchmarks.RunError('Job report not found.')

  results = []
  run_times = {}
  passing_queries = set()
  with open(report_file, 'r') as file:
    for line in file:
      result = json.loads(line)
      logging.info('Timing: %s', result)
      query_id = _GetQueryId(result['script'])
      assert query_id
      passing_queries.add(query_id)
      metadata_copy = metadata.copy()
      metadata_copy['query'] = query_id
      results.append(
          sample.Sample('sparksql_run_time', result['duration'], 'seconds',
                        metadata_copy))
      run_times[query_id] = result['duration']

  metadata['failing_queries'] = ','.join(
      sorted(set(FLAGS.dpb_sparksql_order) - passing_queries))

  results.append(
      sample.Sample('sparksql_total_wall_time', job_result.wall_time,
                    'seconds', metadata))
  results.append(
      sample.Sample('sparksql_geomean_run_time',
                    sample.GeoMean(run_times.values()), 'seconds', metadata))

  cluster_create_time = cluster.GetClusterCreateTime()
  if cluster_create_time is not None:
    results.append(
        sample.Sample('dpb_cluster_create_time', cluster_create_time,
                      'seconds', metadata))
  results.append(
      sample.Sample('dpb_sparksql_job_pending', job_result.pending_time,
                    'seconds', metadata))
  return results
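# The report-parsing versions of Run above call a _GetQueryId helper that is
# defined elsewhere in the benchmark module. The minimal sketch below shows
# what the call sites imply, assuming queries are staged as '<query id>.sql'
# (e.g. '.../2a.sql' -> '2a'); the regex and the imports placed here for
# self-containment are illustrative assumptions, not the benchmark's actual
# implementation.
import re
from typing import Optional


def _GetQueryId(filename: str) -> Optional[str]:
  """Extracts the query id from a staged SQL script path, or returns None."""
  match = re.search(r'([^/]+)\.sql$', filename)
  if match:
    return match.group(1)
  return None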