Example #1
def create(
        self,
        cluster_identifier: ClusterIdentifier) -> DataprocClusterManager:
    """Creates a Dataproc cluster manager provisioned for the cluster
    identified. If the cluster is known, returns the existing cluster manager.
    """
    # Try to get some not-None cluster metadata.
    cluster_metadata = self.cluster_metadata(cluster_identifier)
    if not cluster_metadata:
        raise ValueError(
            'Unknown cluster identifier: %s. Cannot create or reuse '
            'a Dataproc cluster.' % cluster_identifier)
    elif cluster_metadata.region == 'global':
        # The global region is unsupported as it will eventually be deprecated.
        raise ValueError(
            'Clusters in the global region are not supported.')
    elif not cluster_metadata.region:
        _LOGGER.info(
            'No region information was detected, defaulting Dataproc cluster '
            'region to: us-central1.')
        cluster_metadata.region = 'us-central1'
    # else: use the provided region as-is.
    known_dcm = self.dataproc_cluster_managers.get(cluster_metadata, None)
    if known_dcm:
        return known_dcm
    dcm = DataprocClusterManager(cluster_metadata)
    dcm.create_flink_cluster()
    # ClusterMetadata with derivative fields populated by the dcm.
    derived_meta = dcm.cluster_metadata
    self.dataproc_cluster_managers[derived_meta] = dcm
    self.master_urls[derived_meta.master_url] = derived_meta
    # Update the default cluster metadata to the one just created.
    self.set_default_cluster(derived_meta)
    return dcm
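
Because managers are cached by metadata, calling create twice with an equivalent identifier returns the same manager. A minimal usage sketch, assuming Clusters is the container class defining create and that a metadata object with the project_id/region/cluster_name fields referenced above is an acceptable ClusterIdentifier; the construction shown is an assumption for illustration, not the verified Beam API:

# Hypothetical usage sketch; Clusters and the ClusterMetadata constructor
# are assumptions based on the fields referenced in create() above.
clusters = Clusters()
meta = ClusterMetadata(
    project_id='my-project', region='us-west1', cluster_name='interactive')
dcm = clusters.create(meta)      # provisions a new Flink-on-Dataproc cluster
cached = clusters.create(meta)   # reuses the manager cached under meta
assert cached is dcm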
Example #2
def _create_dataproc_cluster_if_applicable(self, user_pipeline):
    """Creates a Dataproc cluster if the provided user_pipeline is running
    FlinkRunner and no flink_master_url was provided as an option. A cluster
    is not created when a flink_master_url is detected.

    Example pipeline options to enable automatic Dataproc cluster creation:
      options = PipelineOptions([
          '--runner=FlinkRunner',
          '--project=my-project',
          '--region=my-region',
          '--environment_type=DOCKER',
      ])

    Example pipeline options to skip automatic Dataproc cluster creation:
      options = PipelineOptions([
          '--runner=FlinkRunner',
          '--flink_master=example.internal:41979',
          '--environment_type=DOCKER',
      ])
    """
    import warnings
    from apache_beam.options.pipeline_options import FlinkRunnerOptions
    from apache_beam.options.pipeline_options import GoogleCloudOptions
    from apache_beam.runners.interactive import interactive_environment as ie
    from apache_beam.runners.portability.flink_runner import FlinkRunner
    # Assumed module path for the Beam release this snippet targets; in
    # newer releases MasterURLIdentifier became ClusterMetadata.
    from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import (
        DataprocClusterManager, MasterURLIdentifier)
    flink_master = user_pipeline.options.view_as(
        FlinkRunnerOptions).flink_master
    clusters = ie.current_env().clusters
    # Only proceed when both conditions hold: the pipeline runs on
    # FlinkRunner and no cluster manager is mapped to it yet.
    if isinstance(self._underlying_runner,
                  FlinkRunner) and clusters.dataproc_cluster_managers.get(
                      str(id(user_pipeline)), None) is None:
        if flink_master == '[auto]':
            # '[auto]' is the default value, meaning the user has not
            # provided a flink_master.
            if ie.current_env()._is_in_ipython:
                warnings.filterwarnings(
                    'ignore',
                    'options is deprecated since First stable release. References to '
                    '<pipeline>.options will not be supported',
                    category=DeprecationWarning)
            project_id = (
                user_pipeline.options.view_as(GoogleCloudOptions).project)
            region = (
                user_pipeline.options.view_as(GoogleCloudOptions).region)
            cluster_name = ie.current_env().clusters.default_cluster_name
            cluster_metadata = MasterURLIdentifier(
                project_id=project_id,
                region=region,
                cluster_name=cluster_name)
        else:
            # A known master_url maps back to its cluster metadata; an
            # unknown one (not managed by us) yields None, and nothing needs
            # to be logged because user-provided masters are allowed.
            cluster_metadata = clusters.master_urls.inverse.get(
                flink_master, None)
        if cluster_metadata:
            # Create the cluster_manager and populate dicts in the clusters
            # instance if the pipeline is not already mapped to an existing
            # cluster_manager.
            cluster_manager = DataprocClusterManager(cluster_metadata)
            cluster_manager.create_flink_cluster()
            clusters.master_urls[
                cluster_manager.master_url] = cluster_metadata
            clusters.dataproc_cluster_managers[str(
                id(user_pipeline))] = cluster_manager
            clusters.master_urls_to_pipelines[
                cluster_manager.master_url].append(str(id(user_pipeline)))
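
The '[auto]' comparison above works because FlinkRunnerOptions defaults flink_master to '[auto]' when the flag is absent, which is how the method distinguishes "no master provided" from a user-supplied endpoint. A small self-contained check of that default:

from apache_beam.options.pipeline_options import FlinkRunnerOptions
from apache_beam.options.pipeline_options import PipelineOptions

opts = PipelineOptions(['--runner=FlinkRunner'])
print(opts.view_as(FlinkRunnerOptions).flink_master)  # [auto]

opts = PipelineOptions(
    ['--runner=FlinkRunner', '--flink_master=example.internal:41979'])
print(opts.view_as(FlinkRunnerOptions).flink_master)  # example.internal:41979

Note that clusters.master_urls acts as a bidirectional mapping here: Example #1 writes master_url-to-metadata entries into it, and the .inverse lookup above resolves a user-supplied master URL back to the metadata of a cluster this environment already manages.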