def Cleanup(benchmark_spec):
  """Deletes the per-lifecycle-step buckets created during Prepare."""
  storage_service = object_storage_service.GetObjectStorageClass(FLAGS.cloud)()
  base_folder = benchmark_spec.uuid.split('-')[0]
  for lifecycle_step, _ in RESOURCE_LIFECYCLE_ARTIFACTS.items():
    dml_script_folder = '{}_{}'.format(base_folder, lifecycle_step)
    storage_service.DeleteBucket(dml_script_folder)
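RESOURCE_LIFECYCLE_ARTIFACTS is not shown on this page; below is a hypothetical sketch of the bucket-naming convention this Cleanup relies on. The step names 'dml_script' and 'data' are inferred from the uri_map keys used in the last example on this page, and the values are assumptions.

# Hypothetical sketch only: one bucket per lifecycle step, named '<run_uri>_<step>'.
RESOURCE_LIFECYCLE_ARTIFACTS = {
    'dml_script': 'sample_table_dml.py',  # assumed: the schema-creation script
    'data': None,                         # assumed: folder for generated sample data
}

def _LifecycleBucketNames(run_uri):
  """Returns the per-step bucket names that Prepare creates and Cleanup deletes."""
  return ['{}_{}'.format(run_uri, step) for step in RESOURCE_LIFECYCLE_ARTIFACTS]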
def Prepare(benchmark_spec):
  """Prepares the VMs with the cloud provider CLI tool and the data file.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.
  """
  providers.LoadProvider(FLAGS.storage)

  service = object_storage_service.GetObjectStorageClass(FLAGS.storage)()
  service.PrepareService(FLAGS.object_storage_region)

  vms = benchmark_spec.vms
  vm_util.RunThreaded(lambda vm: PrepareVM(vm, service), vms)

  # Always clean up server-side state, even if an exception is raised.
  benchmark_spec.always_call_cleanup = True

  # Make the bucket(s).
  bucket_name = 'pkb%s' % FLAGS.run_uri
  if FLAGS.storage != 'GCP' or not FLAGS.object_storage_gcs_multiregion:
    service.MakeBucket(bucket_name)
  else:
    # Use a GCS multiregional bucket.
    multiregional_service = gcs.GoogleCloudStorageService()
    multiregional_service.PrepareService(
        FLAGS.object_storage_gcs_multiregion or DEFAULT_GCS_MULTIREGION)
    multiregional_service.MakeBucket(bucket_name)

  # Save the service and the buckets for later.
  benchmark_spec.service = service
  benchmark_spec.buckets = [bucket_name]
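For completeness, a minimal sketch of the matching Cleanup phase, assuming the service exposes DeleteBucket, CleanupVM, and CleanupService; the benchmark's real Cleanup is not shown on this page.

def Cleanup(benchmark_spec):
  """Sketch: delete the benchmark bucket(s), then tear down per-VM and service state."""
  service = benchmark_spec.service
  for bucket in benchmark_spec.buckets:
    service.DeleteBucket(bucket)
  vm_util.RunThreaded(lambda vm: service.CleanupVM(vm), benchmark_spec.vms)
  service.CleanupService()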
Example #3
def _GetService() -> object_storage_service.ObjectStorageService:
  """Get a ready-to-use instance of ObjectStorageService."""
  # TODO(pclay): consider using FLAGS.storage to allow cross cloud testing?
  cloud = FLAGS.cloud
  providers.LoadProvider(cloud)
  service = object_storage_service.GetObjectStorageClass(cloud)()
  # This method is idempotent with default args and safe to call in each phase.
  service.PrepareService(FLAGS.object_storage_region)
  return service
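Because PrepareService is idempotent, each phase can rebuild its own handle with _GetService() instead of stashing the service on benchmark_spec. An illustrative sketch; the bucket-name convention is assumed from the other examples on this page.

def Cleanup(benchmark_spec):
  """Sketch: reconstruct the service handle and remove the benchmark bucket."""
  del benchmark_spec  # unused in this sketch
  service = _GetService()
  service.DeleteBucket('pkb%s' % FLAGS.run_uri)  # naming convention assumed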
def Prepare(benchmark_spec):
  """Installs and sets up dataset on the Spark clusters.

  Copies scripts and all the queries to cloud.
  Creates external Hive tables for data (unless BigQuery is being used).

  Args:
    benchmark_spec: The benchmark specification
  """
  dpb_service_instance = benchmark_spec.dpb_service
  run_uri = benchmark_spec.uuid.split('-')[0]
  dpb_service_instance.CreateBucket(run_uri)

  temp_run_dir = temp_dir.GetRunDirPath()
  spark_sql_perf_dir = os.path.join(temp_run_dir, 'spark_sql_perf_dir')
  vm_util.IssueCommand(['git', 'clone', SPARK_SQL_PERF_GIT, spark_sql_perf_dir])
  vm_util.IssueCommand(['git', 'checkout', SPARK_SQL_PERF_GIT_COMMIT],
                       cwd=spark_sql_perf_dir)
  query_dir = os.path.join(spark_sql_perf_dir, 'src', 'main', 'resources',
                           FLAGS.dpb_sparksql_query)

  storage_service = object_storage_service.GetObjectStorageClass(FLAGS.cloud)()
  dst_url = '{prefix}{uri}'.format(
      prefix=dpb_service_instance.PERSISTENT_FS_PREFIX, uri=run_uri)
  for dir_name, _, files in os.walk(query_dir):
    for filename in files:
      match = re.match(r'q?([0-9]+)a?\.sql', filename)
      if match:
        query_id = match.group(1)
        # If a query order is specified, only upload those queries.
        if not FLAGS.dpb_sparksql_order or query_id in FLAGS.dpb_sparksql_order:
          query = '{}.sql'.format(query_id)
          src_url = os.path.join(dir_name, filename)
          storage_service.Copy(src_url, os.path.join(dst_url, query))
  for script in [SPARK_TABLE_SCRIPT, SPARK_SQL_RUNNER_SCRIPT]:
    src_url = data.ResourcePath(script)
    storage_service.Copy(src_url, dst_url)
  benchmark_spec.base_dir = dst_url

  # Create external Hive tables if not reading the data from BigQuery
  if FLAGS.dpb_sparksql_data:
    stdout = storage_service.List(FLAGS.dpb_sparksql_data)

    for table_dir in stdout.split('\n'):
      # The directory name is the table name.
      if not table_dir:
        continue
      table = re.split(' |/', table_dir.rstrip('/')).pop()
      stats = dpb_service_instance.SubmitJob(
          pyspark_file=os.path.join(dst_url, SPARK_TABLE_SCRIPT),
          job_type=BaseDpbService.PYSPARK_JOB_TYPE,
          job_arguments=[FLAGS.dpb_sparksql_data, table])
      logging.info(stats)
      if not stats['success']:
        logging.warning('Creating table %s from %s failed', table, table_dir)
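An illustrative worked example of the two parsing steps above, with made-up paths, showing how a query id is extracted from a filename and how a table name is derived from a storage listing line.

import re

# Query id extraction: 'q72a.sql' and '72.sql' both map to query '72'.
match = re.match(r'q?([0-9]+)a?\.sql', 'q72a.sql')
assert match and match.group(1) == '72'  # uploaded to the bucket as '72.sql'

# Table name derivation: the last path component of each listed directory.
table_dir = 'gs://example-bucket/tpcds/store_sales/'  # hypothetical listing entry
table = re.split(' |/', table_dir.rstrip('/')).pop()
assert table == 'store_sales'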
def Prepare(benchmark_spec):
  """Prepares the VMs with the cloud provider CLI tool and the data file.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.
  """
  providers.LoadProvider(FLAGS.storage)

  service = object_storage_service.GetObjectStorageClass(FLAGS.storage)()
  service.PrepareService(FLAGS.object_storage_region)

  vms = benchmark_spec.vms
  for vm in vms:
    PrepareVM(vm, service)
    service.PrepareVM(vm)

  # Always clean up server-side state, even if an exception is raised.
  benchmark_spec.always_call_cleanup = True

  # Make the bucket(s).
  bucket_name = 'pkb%s' % FLAGS.run_uri
  if FLAGS.storage != 'GCP':
    service.MakeBucket(bucket_name)
    buckets = [bucket_name]
  else:
    # TODO(nlavine): make GCP bucket name handling match other
    # providers. Leaving it inconsistent for now to match previous
    # behavior, but should change it after a reasonable deprecation
    # period.
    multiregional_service = gcs.GoogleCloudStorageService()
    multiregional_service.PrepareService(
        FLAGS.object_storage_gcs_multiregion or DEFAULT_GCS_MULTIREGION)
    multiregional_service.MakeBucket(bucket_name)

    region = FLAGS.object_storage_region or gcs.DEFAULT_GCP_REGION
    regional_bucket_name = 'pkb%s-%s' % (FLAGS.run_uri, region)
    regional_service = gcs.GoogleCloudStorageService()
    regional_service.PrepareService(region)
    regional_service.MakeBucket(regional_bucket_name)
    buckets = [bucket_name, regional_bucket_name]

  # Save the service and the buckets for later.
  benchmark_spec.service = service
  benchmark_spec.buckets = buckets
Example #6
def Prepare(benchmark_spec):
  """Prepare phase uses schema creation script and sample data to prepare table.

  Args:
    benchmark_spec: Configuration that holds the definition and instance details
      of the resources used for benchmarking.
  """
  storage_service = object_storage_service.GetObjectStorageClass(FLAGS.cloud)()
  dpb_service_instance = benchmark_spec.dpb_service
  run_uri = benchmark_spec.uuid.split('-')[0]
  uri_map = ManageLifecycleResources(run_uri, dpb_service_instance,
                                     storage_service)
  dml_script_uri = uri_map['dml_script']
  data_folder_uri = uri_map['data']
  stats = dpb_service_instance.SubmitJob(
      pyspark_file=dml_script_uri,
      job_type=BaseDpbService.PYSPARK_JOB_TYPE,
      job_arguments=[data_folder_uri])
  logging.info(stats)
  if not stats['success']:
    logging.warning('Table creation failed')
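ManageLifecycleResources is referenced but not defined on this page. The following is a hypothetical sketch of what it might do, assuming it creates one bucket per lifecycle step, uploads that step's artifact (if any), and returns a map of step name to URI; every name not visible in the examples above is an assumption.

def ManageLifecycleResources(base_folder, dpb_service_instance, storage_service):
  """Hypothetical sketch: provision per-step buckets and return their URIs."""
  uri_map = {}
  for step, artifact in RESOURCE_LIFECYCLE_ARTIFACTS.items():
    bucket = '{}_{}'.format(base_folder, step)
    dpb_service_instance.CreateBucket(bucket)
    uri = '{}{}'.format(dpb_service_instance.PERSISTENT_FS_PREFIX, bucket)
    if artifact:
      # data.ResourcePath resolves a file bundled with the benchmark.
      storage_service.Copy(data.ResourcePath(artifact), uri)
      uri_map[step] = '{}/{}'.format(uri, artifact)  # URI of the uploaded object
    else:
      uri_map[step] = uri  # bare folder URI, e.g. for generated data
  return uri_map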