Example #1
    def test_unsupported_type_display_data(self):
        class MyDisplayComponent(HasDisplayData):
            def display_data(self):
                return {'item_key': 'item_value'}

        with self.assertRaises(ValueError):
            DisplayData.create_from_options(MyDisplayComponent())
Example #2
  def test_unsupported_type_display_data(self):
    class MyDisplayComponent(HasDisplayData):
      def display_data(self):
        return {'item_key': 'item_value'}

    with self.assertRaises(ValueError):
      DisplayData.create_from_options(MyDisplayComponent())
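For contrast, a minimal sketch (not part of the test file above) of the call that does not raise: create_from_options only accepts PipelineOptions subclasses, which is why the bare HasDisplayData subclass above triggers the ValueError. The MyOptions class and --my_flag option are illustrative names only.

  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      # Illustrative flag; any argparse-style option would do.
      parser.add_argument('--my_flag', help='my_flag description')

  # A PipelineOptions subclass is the supported input, so no ValueError here.
  display_data = DisplayData.create_from_options(MyOptions(['--my_flag', 'abc']))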
Example #3
    def test_value_provider_display_data(self):
        class TestOptions(PipelineOptions):
            @classmethod
            def _add_argparse_args(cls, parser):
                parser.add_value_provider_argument('--int_flag',
                                                   type=int,
                                                   help='int_flag description')
                parser.add_value_provider_argument('--str_flag',
                                                   type=str,
                                                   default='hello',
                                                   help='str_flag description')
                parser.add_value_provider_argument(
                    '--float_flag', type=float, help='float_flag description')

        options = TestOptions(['--int_flag', '1'])
        items = DisplayData.create_from_options(options).items
        expected_items = [
            DisplayDataItemMatcher('int_flag', '1'),
            DisplayDataItemMatcher(
                'str_flag', 'RuntimeValueProvider(option: str_flag,'
                ' type: str, default_value: \'hello\')'),
            DisplayDataItemMatcher(
                'float_flag', 'RuntimeValueProvider(option: float_flag,'
                ' type: float, default_value: None)')
        ]
        hc.assert_that(items, hc.contains_inanyorder(*expected_items))
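The display strings in this test follow from how value-provider arguments are materialized: a flag supplied at construction time is wrapped as a static value whose str() is just the value, while an unsupplied flag remains a RuntimeValueProvider until the runner supplies it. A rough sketch of that behaviour, assuming the TestOptions class above and the standard ValueProvider interface (is_accessible() / get()):

    options = TestOptions(['--int_flag', '1'])

    # Supplied on the command line: the value is available immediately, and its
    # string form is the '1' the matcher above compares against.
    print(options.int_flag.is_accessible(), options.int_flag.get())

    # Not supplied: stays a RuntimeValueProvider whose value only becomes
    # available at pipeline run time, hence the repr-style display string above.
    print(options.str_flag.is_accessible())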
Example #4
 def test_create_list_display_data(self):
   flags = ['--extra_package', 'package1', '--extra_package', 'package2']
   pipeline_options = PipelineOptions(flags=flags)
   items = DisplayData.create_from_options(pipeline_options).items
   hc.assert_that(items, hc.contains_inanyorder(
       DisplayDataItemMatcher('extra_packages',
                              str(['package1', 'package2']))))
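The str(['package1', 'package2']) expectation reflects two things: the --extra_package flag is repeatable and is accumulated into a list, and display data stringifies values it has no dedicated type for. A standalone sketch of the underlying argparse behaviour (Beam's SetupOptions is assumed to declare the flag roughly like this, with dest='extra_packages'):

  import argparse

  parser = argparse.ArgumentParser()
  # action='append' collects every occurrence of the flag into one list.
  parser.add_argument('--extra_package', dest='extra_packages', action='append')
  args = parser.parse_args(
      ['--extra_package', 'package1', '--extra_package', 'package2'])
  print(args.extra_packages)       # ['package1', 'package2']
  print(str(args.extra_packages))  # the string the matcher compares against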
Example #5
 def test_value_provider_display_data(self):
   class TestOptions(PipelineOptions):
     @classmethod
     def _add_argparse_args(cls, parser):
       parser.add_value_provider_argument(
           '--int_flag',
           type=int,
           help='int_flag description')
       parser.add_value_provider_argument(
           '--str_flag',
           type=str,
           default='hello',
           help='str_flag description')
       parser.add_value_provider_argument(
           '--float_flag',
           type=float,
           help='float_flag description')
   options = TestOptions(['--int_flag', '1'])
   items = DisplayData.create_from_options(options).items
   expected_items = [
       DisplayDataItemMatcher(
           'int_flag',
           '1'),
       DisplayDataItemMatcher(
           'str_flag',
           'RuntimeValueProvider(option: str_flag,'
           ' type: str, default_value: \'hello\')'
       ),
       DisplayDataItemMatcher(
           'float_flag',
           'RuntimeValueProvider(option: float_flag,'
           ' type: float, default_value: None)'
       )
   ]
   hc.assert_that(items, hc.contains_inanyorder(*expected_items))
Example #6
  def __init__(self, packages, options, environment_version):
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.debug_options = options.view_as(DebugOptions)
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
    self.proto.dataset = '{}/cloud_dataflow'.format(
        GoogleCloudOptions.BIGQUERY_API_SERVICE)
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace(
            'gs:/',
            GoogleCloudOptions.STORAGE_API_SERVICE))
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

    if self.google_cloud_options.service_account_email:
      self.proto.serviceAccountEmail = (
          self.google_cloud_options.service_account_email)

    sdk_name, version_string = get_sdk_name_and_version()

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name',
            value=to_json_value(sdk_name)),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(version_string))])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    if self.standard_options.streaming:
      job_type = 'PYTHON_STREAMING'
    else:
      job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type',
            value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))])
    # Experiments
    if self.debug_options.experiments:
      for experiment in self.debug_options.experiments:
        self.proto.experiments.append(experiment)
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                servicePath=self.google_cloud_options.dataflow_endpoint)))
    pool.autoscalingSettings = dataflow.AutoscalingSettings()
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.max_num_workers:
      pool.autoscalingSettings.maxNumWorkers = (
          self.worker_options.max_num_workers)
    if self.worker_options.autoscaling_algorithm:
      values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
      pool.autoscalingSettings.algorithm = {
          'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
          'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
      }.get(self.worker_options.autoscaling_algorithm)
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.worker_harness_container_image:
      pool.workerHarnessContainerImage = (
          self.worker_options.worker_harness_container_image)
    else:
      # Default to using the worker harness container image for the current SDK
      # version.
      pool.workerHarnessContainerImage = (
          'dataflow.gcr.io/v1beta3/python:%s' %
          get_required_container_version())
    if self.worker_options.use_public_ips is not None:
      if self.worker_options.use_public_ips:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
      else:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

    if self.standard_options.streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      # TODO(ccy): allow customization of disk.
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)

    sdk_pipeline_options = options.get_all_options()
    if sdk_pipeline_options:
      self.proto.sdkPipelineOptions = (
          dataflow.Environment.SdkPipelineOptionsValue())

      options_dict = {k: v
                      for k, v in sdk_pipeline_options.iteritems()
                      if v is not None}
      options_dict['_options_id'] = options._options_id
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='options', value=to_json_value(options_dict)))

      dd = DisplayData.create_from_options(options)
      items = [item.get_dict() for item in dd.items]
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='display_data', value=to_json_value(items)))
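The tempStoragePrefix and package location values built above come from a plain string substitution: the gs:/ scheme prefix is replaced with the storage API service constant. A small sketch of that transformation in isolation (the constant's value is illustrative here, not taken from the SDK):

  # Assumed stand-in for GoogleCloudOptions.STORAGE_API_SERVICE.
  STORAGE_API_SERVICE = 'storage.googleapis.com'

  temp_location = 'gs://my-bucket/temp'
  print(temp_location.replace('gs:/', STORAGE_API_SERVICE))
  # -> 'storage.googleapis.com/my-bucket/temp'

  # Package locations are derived from staging_location the same way.
  staging_location = 'gs://my-bucket/staging'
  print('%s/%s' % (staging_location.replace('gs:/', STORAGE_API_SERVICE),
                   'package1.tar.gz'))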
Example #7
  def __init__(self, packages, options, environment_version, pipeline_url):
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.debug_options = options.view_as(DebugOptions)
    self.pipeline_url = pipeline_url
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
    self.proto.dataset = '{}/cloud_dataflow'.format(
        GoogleCloudOptions.BIGQUERY_API_SERVICE)
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace(
            'gs:/',
            GoogleCloudOptions.STORAGE_API_SERVICE))
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

    if self.google_cloud_options.service_account_email:
      self.proto.serviceAccountEmail = (
          self.google_cloud_options.service_account_email)

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name',
            value=to_json_value(names.BEAM_SDK_NAME)),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(beam_version.__version__))])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    if self.standard_options.streaming:
      job_type = 'FNAPI_STREAMING'
    else:
      if _use_fnapi(options):
        job_type = 'FNAPI_BATCH'
      else:
        job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type',
            value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))])
    # TODO: Use enumerated type instead of strings for job types.
    if job_type.startswith('FNAPI_'):
      runner_harness_override = (
          get_runner_harness_container_image())
      self.debug_options.experiments = self.debug_options.experiments or []
      if runner_harness_override:
        self.debug_options.experiments.append(
            'runner_harness_container_image=' + runner_harness_override)
      # Add the use_multiple_sdk_containers flag if it's not already present.
      # Do not add the flag if 'no_use_multiple_sdk_containers' is present.
      # TODO: Clean up use_multiple_sdk_containers once Python SDK versions up
      # to 2.4 are deprecated.
      if ('use_multiple_sdk_containers' not in self.debug_options.experiments
          and 'no_use_multiple_sdk_containers'
          not in self.debug_options.experiments):
        self.debug_options.experiments.append('use_multiple_sdk_containers')
    # Experiments
    if self.debug_options.experiments:
      for experiment in self.debug_options.experiments:
        self.proto.experiments.append(experiment)
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                servicePath=self.google_cloud_options.dataflow_endpoint)))

    pool.autoscalingSettings = dataflow.AutoscalingSettings()
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.max_num_workers:
      pool.autoscalingSettings.maxNumWorkers = (
          self.worker_options.max_num_workers)
    if self.worker_options.autoscaling_algorithm:
      values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
      pool.autoscalingSettings.algorithm = {
          'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
          'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
      }.get(self.worker_options.autoscaling_algorithm)
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.subnetwork:
      pool.subnetwork = self.worker_options.subnetwork
    if self.worker_options.worker_harness_container_image:
      pool.workerHarnessContainerImage = (
          self.worker_options.worker_harness_container_image)
    else:
      pool.workerHarnessContainerImage = (
          get_default_container_image_for_current_sdk(job_type))
    if self.worker_options.use_public_ips is not None:
      if self.worker_options.use_public_ips:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
      else:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

    if self.standard_options.streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      # TODO(ccy): allow customization of disk.
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)

    sdk_pipeline_options = options.get_all_options()
    if sdk_pipeline_options:
      self.proto.sdkPipelineOptions = (
          dataflow.Environment.SdkPipelineOptionsValue())

      options_dict = {k: v
                      for k, v in sdk_pipeline_options.items()
                      if v is not None}
      options_dict["pipelineUrl"] = pipeline_url
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='options', value=to_json_value(options_dict)))

      dd = DisplayData.create_from_options(options)
      items = [item.get_dict() for item in dd.items]
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='display_data', value=to_json_value(items)))
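The 'options' entry attached to sdkPipelineOptions is just the non-None subset of get_all_options() with the pipeline URL added before JSON conversion. A standalone sketch of that dictionary shaping (the option names and values are illustrative):

  # Illustrative stand-in for options.get_all_options(); names are made up.
  sdk_pipeline_options = {
      'project': 'my-project',
      'temp_location': 'gs://my-bucket/temp',
      'num_workers': None,  # unset options come back as None...
  }

  options_dict = {k: v
                  for k, v in sdk_pipeline_options.items()
                  if v is not None}  # ...and are dropped here
  options_dict['pipelineUrl'] = 'gs://my-bucket/staging/pipeline.pb'
  print(options_dict)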
Example #8
    def __init__(self, packages, options, environment_version, pipeline_url):
        self.standard_options = options.view_as(StandardOptions)
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        self.worker_options = options.view_as(WorkerOptions)
        self.debug_options = options.view_as(DebugOptions)
        self.pipeline_url = pipeline_url
        self.proto = dataflow.Environment()
        self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
        self.proto.dataset = '{}/cloud_dataflow'.format(
            GoogleCloudOptions.BIGQUERY_API_SERVICE)
        self.proto.tempStoragePrefix = (
            self.google_cloud_options.temp_location.replace(
                'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
        # User agent information.
        self.proto.userAgent = dataflow.Environment.UserAgentValue()
        self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

        if self.google_cloud_options.service_account_email:
            self.proto.serviceAccountEmail = (
                self.google_cloud_options.service_account_email)

        self.proto.userAgent.additionalProperties.extend([
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='name', value=to_json_value(self._get_python_sdk_name())),
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='version', value=to_json_value(beam_version.__version__))
        ])
        # Version information.
        self.proto.version = dataflow.Environment.VersionValue()
        _verify_interpreter_version_is_supported(options)
        if self.standard_options.streaming:
            job_type = 'FNAPI_STREAMING'
        else:
            if _use_fnapi(options):
                job_type = 'FNAPI_BATCH'
            else:
                job_type = 'PYTHON_BATCH'
        self.proto.version.additionalProperties.extend([
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='job_type', value=to_json_value(job_type)),
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='major', value=to_json_value(environment_version))
        ])
        # TODO: Use enumerated type instead of strings for job types.
        if job_type.startswith('FNAPI_'):
            runner_harness_override = (get_runner_harness_container_image())
            self.debug_options.experiments = self.debug_options.experiments or []
            if runner_harness_override:
                self.debug_options.experiments.append(
                    'runner_harness_container_image=' +
                    runner_harness_override)
            # Add the use_multiple_sdk_containers flag if it's not already
            # present. Do not add the flag if 'no_use_multiple_sdk_containers'
            # is present.
            # TODO: Clean up use_multiple_sdk_containers once Python SDK
            # versions up to 2.4 are deprecated.
            debug_options_experiments = self.debug_options.experiments
            if ('use_multiple_sdk_containers' not in debug_options_experiments
                    and 'no_use_multiple_sdk_containers'
                    not in debug_options_experiments):
                self.debug_options.experiments.append(
                    'use_multiple_sdk_containers')
        # FlexRS
        if self.google_cloud_options.flexrs_goal == 'COST_OPTIMIZED':
            self.proto.flexResourceSchedulingGoal = (
                dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
                FLEXRS_COST_OPTIMIZED)
        elif self.google_cloud_options.flexrs_goal == 'SPEED_OPTIMIZED':
            self.proto.flexResourceSchedulingGoal = (
                dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
                FLEXRS_SPEED_OPTIMIZED)
        # Experiments
        if self.debug_options.experiments:
            for experiment in self.debug_options.experiments:
                self.proto.experiments.append(experiment)
        # Worker pool(s) information.
        package_descriptors = []
        for package in packages:
            package_descriptors.append(
                dataflow.Package(
                    location='%s/%s' %
                    (self.google_cloud_options.staging_location.replace(
                        'gs:/',
                        GoogleCloudOptions.STORAGE_API_SERVICE), package),
                    name=package))

        pool = dataflow.WorkerPool(
            kind='local' if self.local else 'harness',
            packages=package_descriptors,
            taskrunnerSettings=dataflow.TaskRunnerSettings(
                parallelWorkerSettings=dataflow.WorkerSettings(
                    baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                    servicePath=self.google_cloud_options.dataflow_endpoint)))

        pool.autoscalingSettings = dataflow.AutoscalingSettings()
        # Set worker pool options received through command line.
        if self.worker_options.num_workers:
            pool.numWorkers = self.worker_options.num_workers
        if self.worker_options.max_num_workers:
            pool.autoscalingSettings.maxNumWorkers = (
                self.worker_options.max_num_workers)
        if self.worker_options.autoscaling_algorithm:
            values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
            pool.autoscalingSettings.algorithm = {
                'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
                'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
            }.get(self.worker_options.autoscaling_algorithm)
        if self.worker_options.machine_type:
            pool.machineType = self.worker_options.machine_type
        if self.worker_options.disk_size_gb:
            pool.diskSizeGb = self.worker_options.disk_size_gb
        if self.worker_options.disk_type:
            pool.diskType = self.worker_options.disk_type
        if self.worker_options.zone:
            pool.zone = self.worker_options.zone
        if self.worker_options.network:
            pool.network = self.worker_options.network
        if self.worker_options.subnetwork:
            pool.subnetwork = self.worker_options.subnetwork
        if self.worker_options.worker_harness_container_image:
            pool.workerHarnessContainerImage = (
                self.worker_options.worker_harness_container_image)
        else:
            pool.workerHarnessContainerImage = (
                get_default_container_image_for_current_sdk(job_type))
        if self.worker_options.use_public_ips is not None:
            if self.worker_options.use_public_ips:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PUBLIC)
            else:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PRIVATE)

        if self.standard_options.streaming:
            # Use separate data disk for streaming.
            disk = dataflow.Disk()
            if self.local:
                disk.diskType = 'local'
            # TODO(ccy): allow customization of disk.
            pool.dataDisks.append(disk)
        self.proto.workerPools.append(pool)

        sdk_pipeline_options = options.get_all_options()
        if sdk_pipeline_options:
            self.proto.sdkPipelineOptions = (
                dataflow.Environment.SdkPipelineOptionsValue())

            options_dict = {
                k: v
                for k, v in sdk_pipeline_options.items() if v is not None
            }
            options_dict["pipelineUrl"] = pipeline_url
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='options',
                                   value=to_json_value(options_dict)))

            dd = DisplayData.create_from_options(options)
            items = [item.get_dict() for item in dd.items]
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='display_data',
                                   value=to_json_value(items)))
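A rough usage sketch for this constructor, assuming it lives on a class named Environment in the Dataflow API client; the project, bucket, version string, and package name below are illustrative, not taken from the SDK:

    options = PipelineOptions([
        '--project', 'my-project',
        '--temp_location', 'gs://my-bucket/temp',
        '--staging_location', 'gs://my-bucket/staging',
    ])

    # 'Environment' is an assumption; only the __init__ body is shown above.
    env = Environment(
        packages=['workflow.tar.gz'],
        options=options,
        environment_version='7',  # illustrative environment version string
        pipeline_url='gs://my-bucket/staging/pipeline.pb')
    print(env.proto.tempStoragePrefix)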
Example #9
    def __init__(self, packages, options, environment_version):
        self.standard_options = options.view_as(StandardOptions)
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        self.worker_options = options.view_as(WorkerOptions)
        self.debug_options = options.view_as(DebugOptions)
        self.proto = dataflow.Environment()
        self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
        self.proto.dataset = '{}/cloud_dataflow'.format(
            GoogleCloudOptions.BIGQUERY_API_SERVICE)
        self.proto.tempStoragePrefix = (
            self.google_cloud_options.temp_location.replace(
                'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
        # User agent information.
        self.proto.userAgent = dataflow.Environment.UserAgentValue()
        self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

        if self.google_cloud_options.service_account_email:
            self.proto.serviceAccountEmail = (
                self.google_cloud_options.service_account_email)

        sdk_name, version_string = get_sdk_name_and_version()

        self.proto.userAgent.additionalProperties.extend([
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='name', value=to_json_value(sdk_name)),
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='version', value=to_json_value(version_string))
        ])
        # Version information.
        self.proto.version = dataflow.Environment.VersionValue()
        if self.standard_options.streaming:
            job_type = 'PYTHON_STREAMING'
        else:
            job_type = 'PYTHON_BATCH'
        self.proto.version.additionalProperties.extend([
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='job_type', value=to_json_value(job_type)),
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='major', value=to_json_value(environment_version))
        ])
        # Experiments
        if self.debug_options.experiments:
            for experiment in self.debug_options.experiments:
                self.proto.experiments.append(experiment)
        # Worker pool(s) information.
        package_descriptors = []
        for package in packages:
            package_descriptors.append(
                dataflow.Package(
                    location='%s/%s' %
                    (self.google_cloud_options.staging_location.replace(
                        'gs:/',
                        GoogleCloudOptions.STORAGE_API_SERVICE), package),
                    name=package))

        pool = dataflow.WorkerPool(
            kind='local' if self.local else 'harness',
            packages=package_descriptors,
            taskrunnerSettings=dataflow.TaskRunnerSettings(
                parallelWorkerSettings=dataflow.WorkerSettings(
                    baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                    servicePath=self.google_cloud_options.dataflow_endpoint)))
        pool.autoscalingSettings = dataflow.AutoscalingSettings()
        # Set worker pool options received through command line.
        if self.worker_options.num_workers:
            pool.numWorkers = self.worker_options.num_workers
        if self.worker_options.max_num_workers:
            pool.autoscalingSettings.maxNumWorkers = (
                self.worker_options.max_num_workers)
        if self.worker_options.autoscaling_algorithm:
            values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
            pool.autoscalingSettings.algorithm = {
                'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
                'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
            }.get(self.worker_options.autoscaling_algorithm)
        if self.worker_options.machine_type:
            pool.machineType = self.worker_options.machine_type
        if self.worker_options.disk_size_gb:
            pool.diskSizeGb = self.worker_options.disk_size_gb
        if self.worker_options.disk_type:
            pool.diskType = self.worker_options.disk_type
        if self.worker_options.zone:
            pool.zone = self.worker_options.zone
        if self.worker_options.network:
            pool.network = self.worker_options.network
        if self.worker_options.worker_harness_container_image:
            pool.workerHarnessContainerImage = (
                self.worker_options.worker_harness_container_image)
        else:
            # Default to using the worker harness container image for the current SDK
            # version.
            pool.workerHarnessContainerImage = (
                'dataflow.gcr.io/v1beta3/python:%s' %
                get_required_container_version())
        if self.worker_options.use_public_ips is not None:
            if self.worker_options.use_public_ips:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PUBLIC)
            else:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PRIVATE)

        if self.standard_options.streaming:
            # Use separate data disk for streaming.
            disk = dataflow.Disk()
            if self.local:
                disk.diskType = 'local'
            # TODO(ccy): allow customization of disk.
            pool.dataDisks.append(disk)
        self.proto.workerPools.append(pool)

        sdk_pipeline_options = options.get_all_options()
        if sdk_pipeline_options:
            self.proto.sdkPipelineOptions = (
                dataflow.Environment.SdkPipelineOptionsValue())

            options_dict = {
                k: v
                for k, v in sdk_pipeline_options.iteritems() if v is not None
            }
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='options',
                                   value=to_json_value(options_dict)))

            dd = DisplayData.create_from_options(options)
            items = [item.get_dict() for item in dd.items]
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='display_data',
                                   value=to_json_value(items)))