Example #1
def Run(benchmark_spec):
    """Runs a sequence of Spark SQL Query.

  Args:
    benchmark_spec: Spec needed to run the Spark SQL.

  Returns:
    A list of samples, comprised of the detailed run times of individual query.

  Raises:
    Benchmarks.RunError if no query succeeds.
  """
    dpb_service_instance = benchmark_spec.dpb_service
    metadata = benchmark_spec.dpb_service.GetMetadata()

    metadata['benchmark'] = BENCHMARK_NAMES[FLAGS.dpb_sparksql_query]

    results = []
    failing_queries = []
    run_times = {}
    wall_times = {}
    for query in benchmark_spec.queries:
        try:
            result = _RunSparkSqlJob(
                dpb_service_instance,
                os.path.join(benchmark_spec.base_dir, query + '.sql'),
                os.path.join(benchmark_spec.base_dir, SPARK_SQL_RUNNER_SCRIPT),
                benchmark_spec.table_subdirs)
            logging.info(result)
            metadata_copy = metadata.copy()
            metadata_copy['query'] = query
            results.append(
                sample.Sample('sparksql_wall_time', result.wall_time,
                              'seconds', metadata_copy))
            results.append(
                sample.Sample('sparksql_run_time', result.run_time, 'seconds',
                              metadata_copy))
            wall_times[query] = result.wall_time
            run_times[query] = result.run_time
        except dpb_service.JobSubmissionError:
            failing_queries.append(query)

    metadata['failing_queries'] = ','.join(sorted(failing_queries))

    if results:
        results.append(
            sample.Sample(
                'sparksql_total_wall_time',
                np.fromiter(wall_times.values(), dtype='float').sum(),
                'seconds', metadata))
        results.append(
            sample.Sample('sparksql_total_run_time',
                          np.fromiter(run_times.values(), dtype='float').sum(),
                          'seconds', metadata))
        results.append(
            sample.Sample('sparksql_geomean_wall_time',
                          sample.GeoMean(wall_times.values()), 'seconds',
                          metadata))
        results.append(
            sample.Sample('sparksql_geomean_run_time',
                          sample.GeoMean(run_times.values()), 'seconds',
                          metadata))
    else:
        raise errors.Benchmarks.RunError('No queries succeeded.')

    return results
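
The example aggregates the per-query timings with numpy for the totals and with PerfKitBenchmarker's sample.GeoMean helper for the geometric means. As a rough, non-authoritative sketch (assuming GeoMean is the ordinary geometric mean of positive values, with hypothetical per-query numbers), the aggregation boils down to:

import numpy as np

def geo_mean(values):
    # Ordinary geometric mean of positive run times, in seconds.
    arr = np.fromiter(values, dtype='float')
    if arr.size == 0 or (arr <= 0).any():
        raise ValueError('geometric mean needs a non-empty list of positive values')
    return float(np.exp(np.log(arr).mean()))

# Hypothetical per-query run times collected by the loop above.
run_times = {'q1': 12.3, 'q2': 45.6, 'q7': 3.21}
total_run_time = np.fromiter(run_times.values(), dtype='float').sum()
geomean_run_time = geo_mean(run_times.values())
print(total_run_time, geomean_run_time)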
Example #2
def Run(benchmark_spec):
    """Runs a sequence of Spark SQL Query.

  Args:
    benchmark_spec: Spec needed to run the Spark SQL.

  Returns:
    A list of samples, comprised of the detailed run times of individual query.

  Raises:
    Benchmarks.RunError if no query succeeds.
  """
    dpb_service_instance = benchmark_spec.dpb_service
    metadata = benchmark_spec.dpb_service.GetMetadata()

    metadata['benchmark'] = BENCHMARK_NAMES[FLAGS.dpb_sparksql_query]

    results = []
    unit = 'seconds'
    failing_queries = []
    run_times = {}
    wall_times = {}
    for query in FLAGS.dpb_sparksql_order:
        stats = _RunSparkSqlJob(
            dpb_service_instance,
            os.path.join(benchmark_spec.base_dir, query + '.sql'),
            os.path.join(benchmark_spec.base_dir, SPARK_SQL_RUNNER_SCRIPT))
        logging.info(stats)
        metadata_copy = metadata.copy()
        metadata_copy['query'] = query
        if stats[dpb_service.SUCCESS]:
            run_time = stats[dpb_service.RUNTIME]
            wall_time = run_time + stats[dpb_service.WAITING]
            results.append(
                sample.Sample('sparksql_wall_time', wall_time, unit,
                              metadata_copy))
            results.append(
                sample.Sample('sparksql_run_time', run_time, unit,
                              metadata_copy))
            wall_times[query] = wall_time
            run_times[query] = run_time
        else:
            failing_queries.append(query)

    metadata['failing_queries'] = ','.join(sorted(failing_queries))

    if results:
        results.append(
            sample.Sample(
                'sparksql_total_wall_time',
                np.fromiter(wall_times.values(), dtype='float').sum(), unit,
                metadata))
        results.append(
            sample.Sample('sparksql_total_run_time',
                          np.fromiter(run_times.values(), dtype='float').sum(),
                          unit, metadata))
        results.append(
            sample.Sample('sparksql_geomean_wall_time',
                          sample.GeoMean(wall_times.values()), unit, metadata))
        results.append(
            sample.Sample('sparksql_geomean_run_time',
                          sample.GeoMean(run_times.values()), unit, metadata))
    else:
        raise errors.Benchmarks.RunError('No queries succeeded.')

    return results
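
This variant gets a stats dict back from _RunSparkSqlJob and derives wall time as run time plus the time the job spent waiting to be scheduled. The key names below are made-up stand-ins for dpb_service.SUCCESS, RUNTIME and WAITING; a minimal sketch of that bookkeeping:

# Made-up key names standing in for dpb_service.SUCCESS, RUNTIME and WAITING.
SUCCESS, RUNTIME, WAITING = 'success', 'running_time', 'pending_time'

def split_times(stats):
    # Returns (run_time, wall_time) for a successful query, or None on failure.
    if not stats[SUCCESS]:
        return None
    run_time = stats[RUNTIME]
    # Wall time also counts the time the job spent queued before running.
    return run_time, run_time + stats[WAITING]

print(split_times({SUCCESS: True, RUNTIME: 30.0, WAITING: 5.0}))   # (30.0, 35.0)
print(split_times({SUCCESS: False, RUNTIME: 0.0, WAITING: 5.0}))   # None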
Example #3
def Run(benchmark_spec):
    """Runs a sequence of Spark SQL Query.

  Args:
    benchmark_spec: Spec needed to run the Spark SQL.

  Returns:
    A list of samples, comprised of the detailed run times of individual query.

  Raises:
    Benchmarks.RunError if no query succeeds.
  """
    dpb_service_instance = benchmark_spec.dpb_service
    storage_service = dpb_service_instance.storage_service
    metadata = benchmark_spec.dpb_service.GetMetadata()

    metadata['benchmark'] = BENCHMARK_NAMES[FLAGS.dpb_sparksql_query]

    # Run PySpark Spark SQL Runner
    report_dir = os.path.join(benchmark_spec.base_dir, 'report')
    args = [
        '--sql-scripts',
        ','.join(benchmark_spec.staged_queries),
        '--report-dir',
        report_dir,
    ]
    table_metadata_file = _GetStagedTableMetadata(storage_service,
                                                  benchmark_spec)
    if table_metadata_file:
        args += ['--table-metadata', table_metadata_file]
    jars = []
    if FLAGS.spark_bigquery_connector:
        jars.append(FLAGS.spark_bigquery_connector)
    job_result = dpb_service_instance.SubmitJob(
        pyspark_file=os.path.join(benchmark_spec.base_dir,
                                  SPARK_SQL_RUNNER_SCRIPT),
        job_arguments=args,
        job_jars=jars,
        job_type=dpb_service.BaseDpbService.PYSPARK_JOB_TYPE)

    # Spark can only write data to directories, not files, so do a recursive
    # copy of the report directory and then search it for the single JSON file
    # with the results.
    temp_run_dir = temp_dir.GetRunDirPath()
    storage_service.Copy(report_dir, temp_run_dir, recursive=True)
    report_file = None
    for dir_name, _, files in os.walk(os.path.join(temp_run_dir, 'report')):
        for filename in files:
            if filename.endswith('.json'):
                report_file = os.path.join(dir_name, filename)
                logging.info(report_file)
    if not report_file:
        raise errors.Benchmarks.RunError('Job report not found.')

    results = []
    run_times = {}
    passing_queries = set()
    with open(report_file, 'r') as file:
        for line in file:
            result = json.loads(line)
            logging.info('Timing: %s', result)
            query_id = _GetQueryId(result['script'])
            assert query_id
            passing_queries.add(query_id)
            metadata_copy = metadata.copy()
            metadata_copy['query'] = query_id
            results.append(
                sample.Sample('sparksql_run_time', result['duration'],
                              'seconds', metadata_copy))
            run_times[query_id] = result['duration']

    metadata['failing_queries'] = ','.join(
        sorted(set(FLAGS.dpb_sparksql_order) - passing_queries))

    results.append(
        sample.Sample('sparksql_total_wall_time', job_result.wall_time,
                      'seconds', metadata))
    results.append(
        sample.Sample('sparksql_geomean_run_time',
                      sample.GeoMean(run_times.values()), 'seconds', metadata))
    return results
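
Here the PySpark runner writes one JSON object per query to the report directory, and _GetQueryId maps each script path back to a query id. The report schema and the regex below are illustrative assumptions, not the actual PerfKitBenchmarker implementation:

import json
import re

def get_query_id(script_path):
    # Pulls e.g. '14a' out of '.../q14a.sql'; returns None if the name
    # does not look like a numbered query script.
    match = re.search(r'q?([0-9]+[ab]?)\.sql$', script_path)
    return match.group(1) if match else None

# Hypothetical report lines; the real runner writes one JSON object per query.
report_lines = [
    '{"script": "gs://bucket/queries/q1.sql", "duration": 12.7}',
    '{"script": "gs://bucket/queries/q14a.sql", "duration": 88.2}',
]
run_times = {}
for line in report_lines:
    result = json.loads(line)
    run_times[get_query_id(result['script'])] = result['duration']
print(run_times)   # {'1': 12.7, '14a': 88.2}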
Example #4
def Run(benchmark_spec):
    """Runs a sequence of Spark SQL Query.

  Args:
    benchmark_spec: Spec needed to run the Spark SQL.

  Returns:
    A list of samples, comprised of the detailed run times of individual query.

  Raises:
    Benchmarks.RunError if no query succeeds.
  """
    cluster = benchmark_spec.dpb_service
    storage_service = cluster.storage_service
    metadata = benchmark_spec.dpb_service.GetMetadata()

    metadata['benchmark'] = BENCHMARK_NAMES[FLAGS.dpb_sparksql_query]
    if FLAGS.bigquery_record_format:
        # This takes precedence because, for BigQuery, dpb_sparksql_data_format
        # actually holds a fully qualified Java class/package name.
        metadata['data_format'] = FLAGS.bigquery_record_format
    elif FLAGS.dpb_sparksql_data_format:
        metadata['data_format'] = FLAGS.dpb_sparksql_data_format
    if FLAGS.dpb_sparksql_data_compression:
        metadata['data_compression'] = FLAGS.dpb_sparksql_data_compression

    # Run PySpark Spark SQL Runner
    report_dir = '/'.join(
        [cluster.base_dir, f'report-{int(time.time()*1000)}'])
    args = [
        '--sql-scripts',
        ','.join(benchmark_spec.staged_queries),
        '--report-dir',
        report_dir,
    ]
    if FLAGS.dpb_sparksql_database:
        args += ['--database', FLAGS.dpb_sparksql_database]
    table_metadata = _GetTableMetadata(benchmark_spec)
    if table_metadata:
        table_metadata_file = '/'.join([cluster.base_dir, 'metadata.json'])
        _StageMetadata(table_metadata, storage_service, table_metadata_file)
        args += ['--table-metadata', table_metadata_file]
    else:
        # If we don't pass in table metadata, we must be reading from Hive.
        # Note that you can read from Hive even without --create_hive_tables if
        # the tables were precreated.
        args += ['--enable-hive', 'True']
    if FLAGS.dpb_sparksql_table_cache:
        args += ['--table-cache', FLAGS.dpb_sparksql_table_cache]
    if FLAGS.dpb_sparksql_simultaneous:
        args += ['--simultaneous', 'True']
    jars = []
    if FLAGS.spark_bigquery_connector:
        jars.append(FLAGS.spark_bigquery_connector)
    job_result = cluster.SubmitJob(
        pyspark_file='/'.join([cluster.base_dir, SPARK_SQL_RUNNER_SCRIPT]),
        job_arguments=args,
        job_jars=jars,
        job_type=dpb_service.BaseDpbService.PYSPARK_JOB_TYPE)

    # Spark can only write data to directories, not files, so do a recursive
    # copy of the report directory and then search it for the single JSON file
    # with the results.
    temp_run_dir = temp_dir.GetRunDirPath()
    storage_service.Copy(report_dir, temp_run_dir, recursive=True)
    report_file = None
    for dir_name, _, files in os.walk(
            os.path.join(temp_run_dir, os.path.basename(report_dir))):
        for filename in files:
            if filename.endswith('.json'):
                report_file = os.path.join(dir_name, filename)
                logging.info(report_file)
    if not report_file:
        raise errors.Benchmarks.RunError('Job report not found.')

    results = []
    run_times = {}
    passing_queries = set()
    with open(report_file, 'r') as file:
        for line in file:
            result = json.loads(line)
            logging.info('Timing: %s', result)
            query_id = _GetQueryId(result['script'])
            assert query_id
            passing_queries.add(query_id)
            metadata_copy = metadata.copy()
            metadata_copy['query'] = query_id
            results.append(
                sample.Sample('sparksql_run_time', result['duration'],
                              'seconds', metadata_copy))
            run_times[query_id] = result['duration']

    metadata['failing_queries'] = ','.join(
        sorted(set(FLAGS.dpb_sparksql_order) - passing_queries))

    results.append(
        sample.Sample('sparksql_total_wall_time', job_result.wall_time,
                      'seconds', metadata))
    results.append(
        sample.Sample('sparksql_geomean_run_time',
                      sample.GeoMean(run_times.values()), 'seconds', metadata))
    cluster_create_time = cluster.GetClusterCreateTime()
    if cluster_create_time is not None:
        results.append(
            sample.Sample('dpb_cluster_create_time', cluster_create_time,
                          'seconds', metadata))
    results.append(
        sample.Sample('dpb_sparksql_job_pending', job_result.pending_time,
                      'seconds', metadata))
    return results
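
This example names the report directory with a millisecond timestamp so repeated runs against the same base_dir do not collide, then walks the copied directory for the single JSON part file (Spark writes a directory of part files rather than a single file). A self-contained sketch of that lookup, with hypothetical local paths:

import os
import time

# Unique report directory name, matching the f'report-{int(time.time()*1000)}'
# pattern used above so that re-runs against the same base_dir do not collide.
report_dir_name = 'report-%d' % int(time.time() * 1000)

def find_json_report(local_dir):
    # Spark writes a directory of part files; keep the last *.json file found.
    report_file = None
    for dir_name, _, files in os.walk(local_dir):
        for filename in files:
            if filename.endswith('.json'):
                report_file = os.path.join(dir_name, filename)
    return report_file

# e.g. find_json_report(os.path.join('/tmp/pkb_run_dir', report_dir_name))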