Example #1
    def __init__(self, packages, options, environment_version, pipeline_url):
        self.standard_options = options.view_as(StandardOptions)
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        self.worker_options = options.view_as(WorkerOptions)
        self.debug_options = options.view_as(DebugOptions)
        self.pipeline_url = pipeline_url
        self.proto = dataflow.Environment()
        self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
        self.proto.dataset = '{}/cloud_dataflow'.format(
            GoogleCloudOptions.BIGQUERY_API_SERVICE)
        self.proto.tempStoragePrefix = (
            self.google_cloud_options.temp_location.replace(
                'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
        # User agent information.
        self.proto.userAgent = dataflow.Environment.UserAgentValue()
        self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

        if self.google_cloud_options.service_account_email:
            self.proto.serviceAccountEmail = (
                self.google_cloud_options.service_account_email)

        sdk_name, version_string = get_sdk_name_and_version()

        self.proto.userAgent.additionalProperties.extend([
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='name', value=to_json_value(sdk_name)),
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='version', value=to_json_value(version_string))
        ])
        # Version information.
        self.proto.version = dataflow.Environment.VersionValue()
        if self.standard_options.streaming:
            job_type = 'FNAPI_STREAMING'
        else:
            if _use_fnapi(options):
                job_type = 'FNAPI_BATCH'
            else:
                job_type = 'PYTHON_BATCH'
        self.proto.version.additionalProperties.extend([
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='job_type', value=to_json_value(job_type)),
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='major', value=to_json_value(environment_version))
        ])
        # TODO: Use enumerated type instead of strings for job types.
        if job_type.startswith('FNAPI_'):
            runner_harness_override = (
                dependency.get_runner_harness_container_image())
            self.debug_options.experiments = self.debug_options.experiments or []
            if runner_harness_override:
                self.debug_options.experiments.append(
                    'runner_harness_container_image=' +
                    runner_harness_override)
            # Add the use_multiple_sdk_containers flag if it's not already
            # present. Do not add it if 'no_use_multiple_sdk_containers' is
            # present.
            # TODO: Clean up use_multiple_sdk_containers once Python SDK
            # versions up to 2.4 are deprecated.
            debug_options_experiments = self.debug_options.experiments
            if ('use_multiple_sdk_containers' not in debug_options_experiments
                    and 'no_use_multiple_sdk_containers'
                    not in debug_options_experiments):
                self.debug_options.experiments.append(
                    'use_multiple_sdk_containers')
        # Experiments
        if self.debug_options.experiments:
            for experiment in self.debug_options.experiments:
                self.proto.experiments.append(experiment)
        # Worker pool(s) information.
        package_descriptors = []
        for package in packages:
            package_descriptors.append(
                dataflow.Package(
                    location='%s/%s' %
                    (self.google_cloud_options.staging_location.replace(
                        'gs:/',
                        GoogleCloudOptions.STORAGE_API_SERVICE), package),
                    name=package))

        pool = dataflow.WorkerPool(
            kind='local' if self.local else 'harness',
            packages=package_descriptors,
            taskrunnerSettings=dataflow.TaskRunnerSettings(
                parallelWorkerSettings=dataflow.WorkerSettings(
                    baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                    servicePath=self.google_cloud_options.dataflow_endpoint)))

        pool.autoscalingSettings = dataflow.AutoscalingSettings()
        # Set worker pool options received through command line.
        if self.worker_options.num_workers:
            pool.numWorkers = self.worker_options.num_workers
        if self.worker_options.max_num_workers:
            pool.autoscalingSettings.maxNumWorkers = (
                self.worker_options.max_num_workers)
        if self.worker_options.autoscaling_algorithm:
            values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
            pool.autoscalingSettings.algorithm = {
                'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
                'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
            }.get(self.worker_options.autoscaling_algorithm)
        if self.worker_options.machine_type:
            pool.machineType = self.worker_options.machine_type
        if self.worker_options.disk_size_gb:
            pool.diskSizeGb = self.worker_options.disk_size_gb
        if self.worker_options.disk_type:
            pool.diskType = self.worker_options.disk_type
        if self.worker_options.zone:
            pool.zone = self.worker_options.zone
        if self.worker_options.network:
            pool.network = self.worker_options.network
        if self.worker_options.subnetwork:
            pool.subnetwork = self.worker_options.subnetwork
        if self.worker_options.worker_harness_container_image:
            pool.workerHarnessContainerImage = (
                self.worker_options.worker_harness_container_image)
        else:
            pool.workerHarnessContainerImage = (
                dependency.get_default_container_image_for_current_sdk(
                    job_type))
        if self.worker_options.use_public_ips is not None:
            if self.worker_options.use_public_ips:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PUBLIC)
            else:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PRIVATE)

        if self.standard_options.streaming:
            # Use separate data disk for streaming.
            disk = dataflow.Disk()
            if self.local:
                disk.diskType = 'local'
            # TODO(ccy): allow customization of disk.
            pool.dataDisks.append(disk)
        self.proto.workerPools.append(pool)

        sdk_pipeline_options = options.get_all_options()
        if sdk_pipeline_options:
            self.proto.sdkPipelineOptions = (
                dataflow.Environment.SdkPipelineOptionsValue())

            options_dict = {
                k: v
                for k, v in sdk_pipeline_options.items() if v is not None
            }
            options_dict["pipelineUrl"] = pipeline_url
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='options',
                                   value=to_json_value(options_dict)))

            dd = DisplayData.create_from_options(options)
            items = [item.get_dict() for item in dd.items]
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='display_data',
                                   value=to_json_value(items)))
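
The constructor above builds the `dataflow.Environment` job proto from the pipeline options, the staged packages, and a pipeline URL. Below is a hypothetical sketch of how it might be invoked, assuming the snippet belongs to the `Environment` class in Apache Beam's internal `apiclient` module from the same era; the module path and every option value are illustrative assumptions, not part of the snippet.

# Hypothetical usage sketch. Assumes an Apache Beam SDK version whose internal
# module apache_beam.runners.dataflow.internal.apiclient contains an Environment
# class matching the snippet above; all option values are placeholders.
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.dataflow.internal import apiclient

options = PipelineOptions([
    '--project=my-project',                       # placeholder GCP project
    '--temp_location=gs://my-bucket/tmp',         # rewritten into tempStoragePrefix
    '--staging_location=gs://my-bucket/staging',  # prefix for the staged packages
])

env = apiclient.Environment(
    packages=['dataflow_python_sdk.tar'],         # names of files already staged
    options=options,
    environment_version='7',                      # value of the 'major' version property
    pipeline_url='gs://my-bucket/staging/pipeline.pb')

print(env.proto)  # the populated dataflow.Environment message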
Example #2
  def __init__(self, packages, options, environment_version, pipeline_url):
    self.standard_options = options.view_as(StandardOptions)
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    self.worker_options = options.view_as(WorkerOptions)
    self.debug_options = options.view_as(DebugOptions)
    self.pipeline_url = pipeline_url
    self.proto = dataflow.Environment()
    self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
    self.proto.dataset = '{}/cloud_dataflow'.format(
        GoogleCloudOptions.BIGQUERY_API_SERVICE)
    self.proto.tempStoragePrefix = (
        self.google_cloud_options.temp_location.replace(
            'gs:/',
            GoogleCloudOptions.STORAGE_API_SERVICE))
    # User agent information.
    self.proto.userAgent = dataflow.Environment.UserAgentValue()
    self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

    if self.google_cloud_options.service_account_email:
      self.proto.serviceAccountEmail = (
          self.google_cloud_options.service_account_email)

    sdk_name, version_string = get_sdk_name_and_version()

    self.proto.userAgent.additionalProperties.extend([
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='name',
            value=to_json_value(sdk_name)),
        dataflow.Environment.UserAgentValue.AdditionalProperty(
            key='version', value=to_json_value(version_string))])
    # Version information.
    self.proto.version = dataflow.Environment.VersionValue()
    if self.standard_options.streaming:
      job_type = 'FNAPI_STREAMING'
    else:
      if _use_fnapi(options):
        job_type = 'FNAPI_BATCH'
      else:
        job_type = 'PYTHON_BATCH'
    self.proto.version.additionalProperties.extend([
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='job_type',
            value=to_json_value(job_type)),
        dataflow.Environment.VersionValue.AdditionalProperty(
            key='major', value=to_json_value(environment_version))])
    # TODO: Use enumerated type instead of strings for job types.
    if job_type.startswith('FNAPI_'):
      runner_harness_override = (
          dependency.get_runner_harness_container_image())
      if runner_harness_override:
        self.debug_options.experiments = self.debug_options.experiments or []
        self.debug_options.experiments.append(
            'runner_harness_container_image=' + runner_harness_override)
    # Experiments
    if self.debug_options.experiments:
      for experiment in self.debug_options.experiments:
        self.proto.experiments.append(experiment)
    # Worker pool(s) information.
    package_descriptors = []
    for package in packages:
      package_descriptors.append(
          dataflow.Package(
              location='%s/%s' % (
                  self.google_cloud_options.staging_location.replace(
                      'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                  package),
              name=package))

    pool = dataflow.WorkerPool(
        kind='local' if self.local else 'harness',
        packages=package_descriptors,
        # https://issues.apache.org/jira/browse/BEAM-3116
        # metadata=dataflow.WorkerPool.MetadataValue(),
        taskrunnerSettings=dataflow.TaskRunnerSettings(
            parallelWorkerSettings=dataflow.WorkerSettings(
                baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                servicePath=self.google_cloud_options.dataflow_endpoint)))

    # https://issues.apache.org/jira/browse/BEAM-3116
    # pool.metadata.additionalProperties.append(
    #     dataflow.WorkerPool.MetadataValue.AdditionalProperty(
    #         key=names.STAGED_PIPELINE_URL_METADATA_FIELD, value=pipeline_url))

    pool.autoscalingSettings = dataflow.AutoscalingSettings()
    # Set worker pool options received through command line.
    if self.worker_options.num_workers:
      pool.numWorkers = self.worker_options.num_workers
    if self.worker_options.max_num_workers:
      pool.autoscalingSettings.maxNumWorkers = (
          self.worker_options.max_num_workers)
    if self.worker_options.autoscaling_algorithm:
      values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
      pool.autoscalingSettings.algorithm = {
          'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
          'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
      }.get(self.worker_options.autoscaling_algorithm)
    if self.worker_options.machine_type:
      pool.machineType = self.worker_options.machine_type
    if self.worker_options.disk_size_gb:
      pool.diskSizeGb = self.worker_options.disk_size_gb
    if self.worker_options.disk_type:
      pool.diskType = self.worker_options.disk_type
    if self.worker_options.zone:
      pool.zone = self.worker_options.zone
    if self.worker_options.network:
      pool.network = self.worker_options.network
    if self.worker_options.subnetwork:
      pool.subnetwork = self.worker_options.subnetwork
    if self.worker_options.worker_harness_container_image:
      pool.workerHarnessContainerImage = (
          self.worker_options.worker_harness_container_image)
    else:
      pool.workerHarnessContainerImage = (
          dependency.get_default_container_image_for_current_sdk(job_type))
    if self.worker_options.use_public_ips is not None:
      if self.worker_options.use_public_ips:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
      else:
        pool.ipConfiguration = (
            dataflow.WorkerPool
            .IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

    if self.standard_options.streaming:
      # Use separate data disk for streaming.
      disk = dataflow.Disk()
      if self.local:
        disk.diskType = 'local'
      # TODO(ccy): allow customization of disk.
      pool.dataDisks.append(disk)
    self.proto.workerPools.append(pool)

    sdk_pipeline_options = options.get_all_options()
    if sdk_pipeline_options:
      self.proto.sdkPipelineOptions = (
          dataflow.Environment.SdkPipelineOptionsValue())

      options_dict = {k: v
                      for k, v in sdk_pipeline_options.items()
                      if v is not None}
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='options', value=to_json_value(options_dict)))

      dd = DisplayData.create_from_options(options)
      items = [item.get_dict() for item in dd.items]
      self.proto.sdkPipelineOptions.additionalProperties.append(
          dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
              key='display_data', value=to_json_value(items)))
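
Both examples end by attaching the pipeline options to the proto: `options.get_all_options()` returns every option (unset ones as `None`), the dict comprehension drops the unset entries, and the result is serialized with `to_json_value` under the `'options'` key. A minimal standalone sketch of that filtering step follows; the dict contents are invented placeholders and the proto/`to_json_value` plumbing is left out.

# Minimal sketch of the option-filtering step, using a plain dict in place of
# options.get_all_options(); keys and values here are invented placeholders.
sdk_pipeline_options = {
    'project': 'my-project',
    'streaming': False,
    'num_workers': None,            # unset options come back as None
    'temp_location': 'gs://my-bucket/tmp',
}

# Same comprehension as in the examples: keep only the options that were set.
options_dict = {k: v for k, v in sdk_pipeline_options.items() if v is not None}
options_dict['pipelineUrl'] = 'gs://my-bucket/staging/pipeline.pb'  # added in Example #1 only

print(options_dict)
# -> e.g. {'project': 'my-project', 'streaming': False,
#          'temp_location': 'gs://my-bucket/tmp',
#          'pipelineUrl': 'gs://my-bucket/staging/pipeline.pb'}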