def get_cluster_client(self, location: Optional[str] = None) -> ClusterControllerClient:
        """Returns ClusterControllerClient.

        :param location: Dataproc region to target. When set to a value other
            than ``'global'``, the client is pointed at the regional endpoint
            ``{location}-dataproc.googleapis.com:443``. When unset or
            ``'global'``, no ``client_options`` are passed.
        :return: A client built from ``self._get_credentials()`` and
            ``self.client_info``.
        """
        # 'global' is not a regional host prefix; building
        # 'global-dataproc.googleapis.com' would point at a non-regional name,
        # so it falls through to the client's default endpoint instead.
        client_options = None
        if location and location != 'global':
            client_options = {'api_endpoint': f'{location}-dataproc.googleapis.com:443'}

        return ClusterControllerClient(
            credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options
        )
    async def test_progress(self, monkeypatch):
        """Check the events yielded by ``spawner.progress()``.

        The logging client's ``list_log_entries`` is replaced with a function
        returning five fake log entries, and the spawner's ``operation`` is
        replaced with a ``MockOperation``. The collected progress events must
        equal one event per log entry followed by a final
        ``operation.done()`` event.
        """
        fake_creds = AnonymousCredentials()
        mock_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_logging_client = mock.create_autospec(
            logging_v2.LoggingServiceV2Client(credentials=fake_creds))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_client,
                                  user=MockUser(),
                                  _mock=True,
                                  logging=mock_logging_client,
                                  gcs_notebooks=self.gcs_notebooks)
        spawner.project = "test-progress"

        async def collect(ait):
            # Drain an async iterator into a list.
            return [value async for value in ait]

        def create_logs():
            # Five entries whose payloads differ only by their index.
            return [
                LogEntry(insert_id=f'entry_{i}',
                         json_payload=ParseDict(
                             {
                                 'method': 'method',
                                 'message': f'message_{i}'
                             }, Struct()))
                for i in range(5)
            ]

        def create_expected():
            # Expected progress starts at 5 and, for every log entry, moves a
            # quarter of the remaining distance to 90 (rounded up).
            progress = 5
            expected = []
            for i, _ in enumerate(create_logs()):
                progress += math.ceil((90 - progress) / 4)
                expected.append({
                    'progress': progress,
                    'message': f'method: message_{i}'
                })
            # 71 is the value the loop above reaches after five entries.
            expected.append({'message': 'operation.done()', 'progress': 71})
            return expected

        def test_list_log_entries(*args, **kwargs):
            return create_logs()

        op = MockOperation('op1', 'cluster1-op1')

        monkeypatch.setattr(mock_logging_client, 'list_log_entries',
                            test_list_log_entries)
        monkeypatch.setattr(spawner, 'operation', op)

        # Unpacking doubles as a check that start() returns a 2-tuple.
        _, _ = await spawner.start()
        assert await collect(spawner.progress()) == create_expected()
    def test_minimium_cluster_definition(self, monkeypatch):
        """ Some keys must always be present for JupyterHub to work.

        Builds a cluster config with ``read_gcs_file`` returning the local
        ``tests/test_data/minimum.yaml`` and checks the project id, cluster
        name, zone, optional components and ``dataproc:jupyter.hub.*``
        properties of the result.
        """

        def test_read_file(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/minimum.yaml', 'r') as config_file:
                return config_file.read()

        def test_clustername(*args, **kwargs):
            return 'test-clustername'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
        monkeypatch.setattr(spawner, "clustername", test_clustername)

        spawner.project = "test-project"
        spawner.zone = "test-self1-b"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"

        config_built = spawner._build_cluster_config()

        assert 'project_id' in config_built
        assert 'cluster_name' in config_built

        assert config_built['project_id'] == 'test-project'
        assert config_built['cluster_name'] == 'test-clustername'

        # Only the last path segment of the zone URI is compared.
        assert config_built['config']['gce_cluster_config']['zone_uri'].split(
            '/')[-1] == 'test-self1-b'

        software_config = config_built['config']['software_config']

        assert Component['JUPYTER'].value in software_config[
            'optional_components']
        assert Component['ANACONDA'].value in software_config[
            'optional_components']

        assert 'dataproc:jupyter.hub.args' in software_config['properties']
        assert 'dataproc:jupyter.hub.enabled' in software_config['properties']
        # assert 'dataproc:jupyter.notebook.gcs.dir' in config_built['config']['software_config']['properties']
        assert 'dataproc:jupyter.hub.env' in software_config['properties']
    def test_locations(self, monkeypatch):
        """Check location-dependent fields of the built cluster config.

        Uses ``tests/test_data/basic_uri.yaml`` as the cluster definition and
        verifies that the subnetwork URI carries the region derived from the
        user-selected zone, and that machine/accelerator type URIs are the
        short names asserted below.
        """

        def test_read_file(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/basic_uri.yaml', 'r') as config_file:
                return config_file.read()

        def test_clustername(*args, **kwargs):
            return 'test-clustername'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
        monkeypatch.setattr(spawner, "clustername", test_clustername)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.user_options = {
            'cluster_type': 'basic_uri.yaml',
            'cluster_zone': 'us-east1-d'
        }

        user_zone = spawner.user_options['cluster_zone']
        # Strip the trailing "-<letter>" zone suffix to get the region.
        user_region = user_zone[:-2]

        config_built = spawner._build_cluster_config()
        cluster_config = config_built['config']

        # The region is the third segment from the end of the subnetwork URI.
        assert cluster_config['gce_cluster_config'][
            'subnetwork_uri'].split('/')[-3] == user_region
        assert cluster_config['master_config'][
            'machine_type_uri'] == 'n1-standard-4'
        assert cluster_config['worker_config'][
            'machine_type_uri'] == 'n1-highmem-16'
        assert cluster_config['secondary_worker_config'][
            'machine_type_uri'] == 'n1-standard-4'
        assert cluster_config['master_config']['accelerators'][0][
            'accelerator_type_uri'] == 'nvidia-tesla-v100'
    def test_cluster_definition_overrides(self, monkeypatch):
        """Check that config settings incompatible with JupyterHub are overwritten correctly.

        Uses ``tests/test_data/export.yaml`` as the cluster definition and
        verifies that Component Gateway is disabled, ``preemptibility`` is
        absent from master/worker configs, and the namenode address
        properties are absent from the software config.
        """

        def test_read_file(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/export.yaml', 'r') as config_file:
                return config_file.read()

        def test_clustername(*args, **kwargs):
            return 'test-clustername'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
        monkeypatch.setattr(spawner, "clustername", test_clustername)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.user_options = {
            'cluster_type': 'export.yaml',
            'cluster_zone': 'test-form1-a'
        }

        config_built = spawner._build_cluster_config()
        cluster_config = config_built['config']
        properties = cluster_config['software_config']['properties']

        # Verify that we disable Component Gateway (temporarily)
        assert cluster_config['endpoint_config'][
            'enable_http_port_access'] == False
        # Verify that we disable preemptibility (temporarily)
        assert 'preemptibility' not in cluster_config['master_config']
        assert 'preemptibility' not in cluster_config['worker_config']
        # Verify that we removed cluster-specific namenode properties
        assert 'hdfs:dfs.namenode.lifeline.rpc-address' not in properties
        assert 'hdfs:dfs.namenode.servicerpc-address' not in properties
    def test_cluster_definition_keep_core_values(self, monkeypatch):
        """ Some system's default values must remain no matter what.

        Uses ``tests/test_data/basic.yaml`` as the cluster definition and
        verifies that the project id, cluster name and the
        ``dataproc:jupyter.hub.*`` properties in the built config equal the
        values set on the spawner.
        """

        def test_read_file(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/basic.yaml', 'r') as config_file:
                return config_file.read()

        def test_clustername(*args, **kwargs):
            return 'test-clustername'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
        monkeypatch.setattr(spawner, "clustername", test_clustername)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.user_options = {
            'cluster_type': 'basic.yaml',
            'cluster_zone': 'test-form1-a'
        }

        config_built = spawner._build_cluster_config()

        assert config_built['project_id'] == 'test-project'
        assert config_built['cluster_name'] == 'test-clustername'

        properties = config_built['config']['software_config']['properties']

        assert properties['dataproc:jupyter.hub.args'] == 'test-args-str'
        assert properties['dataproc:jupyter.hub.enabled'] == 'true'
        # assert config_built['config']['software_config']['properties']['dataproc:jupyter.notebook.gcs.dir'] == f'gs://users-notebooks/fake'
        assert properties['dataproc:jupyter.hub.env'] == 'test-env-str'
    def test_image_version_supports_component_gateway(self):
        """Check ``_validate_image_version_supports_component_gateway`` per image version.

        Each case pairs an image version string with the exact boolean the
        validator is expected to return for it.
        """
        credentials = AnonymousCredentials()
        dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=credentials))
        gcs_client = mock.create_autospec(
            storage.Client(credentials=credentials, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=dataproc_client,
                                  gcs=gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # (image version, expected validator result), checked in this order.
        cases = [
            ('1.3', True),
            ('1.3-debian9', True),
            ('1.3.6-debian9', False),
            ('1.3.59-debian9', True),
            ('1.3.999-debian9', True),
            ('1.4-debian10', True),
            ('1.4.6-debian10', False),
            ('1.4.31-debian10', True),
            ('1.5-debian10', True),
            ('1.5.0-debian10', False),
            ('1.5.5-debian10', True),
            ('2', True),
            ('2.0', True),
            ('2.0.0', True),
            ('2.3.0', True),
            ('2.0.0-RC1-preview', True),
            ('weird-unexpected-version-124.3.v2.2020-02-15', True),
            ('1.3.weird-version-again', True),
        ]

        for image_version, expected in cases:
            supported = (
                spawner._validate_image_version_supports_component_gateway(
                    image_version))
            # Identity check keeps the original strictness (a real bool).
            assert supported is expected, image_version
    async def test_poll_no_cluster(self):
        """``poll()`` must return 1 when ``get_cluster`` yields ``None``."""
        dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=AnonymousCredentials()))
        # Simulate a lookup that finds no cluster.
        dataproc_client.get_cluster.return_value = None

        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=dataproc_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        project_name = 'test-poll-no-cluster'
        spawner.project = project_name
        assert spawner.project == 'test-poll-no-cluster'

        poll_result = await spawner.poll()
        assert poll_result == 1
    def test_clean_gcs_path(self, monkeypatch):
        """Check the three output shapes of ``_clean_gcs_path``.

        For the input ``gs://bucket/config/`` the default call drops the
        trailing slash, ``return_gs=False`` also drops the ``gs://`` prefix,
        and ``return_slash=True`` keeps the trailing slash.
        """
        dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=AnonymousCredentials()))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=dataproc_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        gcs_path = "gs://bucket/config/"

        default_form = spawner._clean_gcs_path(gcs_path)
        assert default_form == "gs://bucket/config"

        without_scheme = spawner._clean_gcs_path(gcs_path, return_gs=False)
        assert without_scheme == "bucket/config"

        with_slash = spawner._clean_gcs_path(gcs_path, return_slash=True)
        assert with_slash == "gs://bucket/config/"
    async def test_domain_scoped_zonal_dns(self):
        """Check the host returned by ``start()`` for a domain-scoped project.

        With project ``test:domain-scoped`` the expected host places the part
        after the colon before the part preceding it
        (``...c.domain-scoped.test.internal``).
        """
        dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=AnonymousCredentials()))

        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=dataproc_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        spawner.project = "test:domain-scoped"
        assert spawner.project == "test:domain-scoped"

        address = await spawner.start()
        (ip, port) = address

        expected_host = f'dataprochub-fake-m.{self.zone}.c.domain-scoped.test.internal'
        assert ip == expected_host
        assert port == 0
    def test_cluster_definition_check_core_fields(self, monkeypatch):
        """ Values chosen by the user through the form overwrites others. If the
        admin wants to prevent that behavior, they should remove form elements.
        TODO(mayran): Check keys so users can not add custom ones.

        Here ``get_username`` is patched to return ``foo-user`` and
        ``cluster_name_pattern`` is ``my-cluster-{}``; the built config must
        carry the resulting cluster name and the spawner's project id.
        """

        def test_read_file(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/basic.yaml', 'r') as config_file:
                return config_file.read()

        def test_username(*args, **kwargs):
            return 'foo-user'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
        monkeypatch.setattr(spawner, "get_username", test_username)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.cluster_name_pattern = 'my-cluster-{}'
        spawner.user_options = {
            'cluster_type': 'basic.yaml',
            'cluster_zone': 'test-form1-a'
        }

        config_built = spawner._build_cluster_config()

        assert config_built['cluster_name'] == 'my-cluster-foo-user'
        assert config_built['project_id'] == 'test-project'
    def test_duration(self, monkeypatch):
        """Check that initialization-action timeouts resolve to 600 seconds.

        Uses ``tests/test_data/duration.yaml`` as the cluster definition; the
        first two initialization actions must both end up with an
        ``execution_timeout`` of 600 seconds in the built config.
        """

        def test_read_file(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/duration.yaml', 'r') as config_file:
                return config_file.read()

        def test_clustername(*args, **kwargs):
            return 'test-clustername'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
        monkeypatch.setattr(spawner, "clustername", test_clustername)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.user_options = {
            'cluster_type': 'duration.yaml',
            'cluster_zone': 'test-form1-a'
        }

        config_built = spawner._build_cluster_config()
        init_actions = config_built['config']['initialization_actions']

        # Test 600s string
        assert init_actions[0]['execution_timeout']['seconds'] == 600
        # Test Duration protobuf
        assert init_actions[1]['execution_timeout']['seconds'] == 600
    def test_metadata(self, monkeypatch):
        """Check the GCE metadata of the built cluster config.

        Uses ``tests/test_data/basic.yaml`` as the cluster definition; the
        resulting metadata must equal ``m1``/``m2`` plus a ``session-user``
        entry set to ``MockUser.name``.
        """

        def test_read_file(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/basic.yaml', 'r') as config_file:
                return config_file.read()

        def test_clustername(*args, **kwargs):
            return 'test-clustername'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
        monkeypatch.setattr(spawner, "clustername", test_clustername)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.user_options = {
            'cluster_type': 'basic.yaml',
            'cluster_zone': 'test-form1-a'
        }

        config_built = spawner._build_cluster_config()

        # Exact-match comparison: no extra metadata keys are tolerated.
        assert config_built['config']['gce_cluster_config']['metadata'] == {
            'm1': 'v1',
            'm2': 'v2',
            'session-user': MockUser.name
        }
    async def test_start_normal(self):
        """Check ``start()`` when ``get_cluster`` reports no existing cluster.

        Verifies the returned host/port, that ``create_cluster`` was called
        exactly once, and that the stored cluster definition carries the
        expected name, zone URI and a ``JUPYTERHUB_API_URL`` entry in the hub
        environment.
        """
        # Mock the Dataproc API client
        dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=AnonymousCredentials()))
        dataproc_client.create_cluster.return_value = operations_pb2.Operation()

        # Force no existing clusters to bypass the check in the spawner
        dataproc_client.get_cluster.return_value = None

        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=dataproc_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Test that the traitlets work
        spawner.project = 'test-create'
        assert spawner.project == 'test-create'
        assert spawner.region == self.region

        address = await spawner.start()
        (ip, port) = address
        assert ip == f'dataprochub-fake-m.{self.zone}.c.{spawner.project}.internal'
        # JupyterHub defaults to 0 if no port set
        assert port == 0

        dataproc_client.create_cluster.assert_called_once()

        definition = spawner.cluster_definition
        assert definition['cluster_name'] == 'dataprochub-fake'

        actual_zone_uri = definition['config']['gce_cluster_config']['zone_uri']
        expected_zone_uri = (
            f'https://www.googleapis.com/compute/v1/projects/{spawner.project}/zones/{spawner.zone}'
        )
        assert actual_zone_uri == expected_zone_uri

        # The hub environment is stored as a JSON string property.
        hub_env_json = definition['config']['software_config']['properties'][
            'dataproc:jupyter.hub.env']
        env = json.loads(hub_env_json)
        assert env['JUPYTERHUB_API_URL'] is not None
    async def test_start_existing_clustername(self):
        """``start()`` must not call ``create_cluster`` in this setup.

        ``get_cluster`` is left at its autospec default (a non-``None``
        mock); presumably the spawner treats that as an existing cluster --
        TODO confirm against the spawner implementation.
        """
        dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=AnonymousCredentials()))

        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=dataproc_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        spawner.project = "test-create-existing"
        assert spawner.project == "test-create-existing"

        address = await spawner.start()
        (ip, port) = address

        expected_host = f'dataprochub-fake-m.{self.zone}.c.{spawner.project}.internal'
        assert ip == expected_host
        assert port == 0

        dataproc_client.create_cluster.assert_not_called()
    async def test_poll_create(self):
        """``poll()`` must return ``None`` while the cluster is ``CREATING``.

        ``get_cluster`` is stubbed to return a ``Cluster`` whose status state
        is ``ClusterStatus.State.CREATING``.
        """
        creating_cluster = Cluster(
            status={'state': ClusterStatus.State.CREATING})

        fake_creds = AnonymousCredentials()
        mock_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_client.get_cluster.return_value = creating_cluster

        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        spawner.project = 'test-poll-create'
        assert spawner.project == 'test-poll-create'

        # Identity comparison: None is a singleton, and `== None` could be
        # satisfied by any object with a permissive __eq__.
        assert await spawner.poll() is None
    async def test_stop_normal(self):
        """``stop()`` must call ``delete_cluster`` once with the expected arguments.

        The expected keyword arguments are the spawner's project, the test
        class's region and the cluster name ``dataprochub-fake``.
        """
        dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=AnonymousCredentials()))

        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=dataproc_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        spawner.project = 'test-stop'
        assert spawner.project == 'test-stop'
        assert spawner.region == self.region

        # The return value of stop() is not inspected by this test.
        await spawner.stop()

        expected_call = dict(project_id='test-stop',
                             region=self.region,
                             cluster_name='dataprochub-fake')
        dataproc_client.delete_cluster.assert_called_once_with(**expected_call)
Example #18
0
    def get_cluster_client(
            self,
            region: Optional[str] = None,
            location: Optional[str] = None) -> ClusterControllerClient:
        """Build a ClusterControllerClient, optionally bound to a regional endpoint.

        :param region: Dataproc region. Any truthy value other than
            ``'global'`` selects the endpoint
            ``{region}-dataproc.googleapis.com:443``.
        :param location: Older name for ``region``. When it is not ``None`` a
            ``DeprecationWarning`` is emitted and its value replaces
            ``region``.
        :return: A client built from ``self._get_credentials()`` and
            ``self.client_info``.
        """
        if location is not None:
            warnings.warn(
                "Parameter `location` will be deprecated. Please provide value through `region` parameter instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            # The legacy argument wins over `region` when both are given.
            region = location

        needs_regional_endpoint = bool(region) and region != 'global'
        client_options = (
            {'api_endpoint': f'{region}-dataproc.googleapis.com:443'}
            if needs_regional_endpoint else None
        )

        return ClusterControllerClient(
            credentials=self._get_credentials(),
            client_info=self.client_info,
            client_options=client_options,
        )
    def test_validate_proto(self, monkeypatch):
        """Check that ``_validate_proto`` strips unknown/bad fields and reports them.

        Uses ``tests/test_data/unknown_fields.yaml`` as the cluster
        definition. The test verifies the warning messages returned by
        ``_validate_proto``, that the cleaned config equals a hand-cleaned
        copy once both are coerced to ``Cluster``, and that the fully built
        config no longer contains the unknown fields.
        """

        def test_read_file(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/unknown_fields.yaml',
                      'r') as config_file:
                return config_file.read()

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.user_options = {
            'cluster_type': 'basic_uri.yaml',
            'cluster_zone': 'us-east1-d'
        }

        cleaned_config = spawner.get_cluster_definition('')
        # Named `warning_messages` so the stdlib `warnings` module name is not
        # shadowed inside this test.
        warning_messages = dataprocspawner.spawner._validate_proto(
            cleaned_config, Cluster)

        # Check that we had appropriate warning messages
        assert len(warning_messages) == 7
        expected_warnings = [
            'Removing unknown/bad value BAD_ENUM_VALUE for field consume_reservation_type.',
            "Removing unknown field unknown_field for class <class 'google.cloud.dataproc_v1beta2.types.clusters.NodeInitializationAction'>",
            'Removing unknown/bad value UNKNOWN_COMPONENT_1 for field optional_components.',
            'Removing unknown/bad value UNKNOWN_COMPONENT_2 for field optional_components.',
            'Removing unknown/bad value UNKNOWN_COMPONENT_3 for field optional_components.',
            "Removing unknown field unknown_field_config_level for class <class 'google.cloud.dataproc_v1beta2.types.clusters.ClusterConfig'>",
            "Removing unknown field unknown_field_top_level for class <class 'google.cloud.dataproc_v1beta2.types.clusters.Cluster'>",
        ]
        for w in expected_warnings:
            assert w in warning_messages, f'Expected message {w} in warnings {warning_messages}'

        raw_config = spawner.get_cluster_definition('')
        # Construct expected output
        del raw_config['unknown_field_top_level']
        del raw_config['config']['unknown_field_config_level']
        del raw_config['config']['initialization_actions'][0]['unknown_field']
        del raw_config['config']['gce_cluster_config']['reservation_affinity'][
            'consume_reservation_type']
        raw_config['config']['software_config']['optional_components'] = [
            'JUPYTER', 'ZEPPELIN', 'ANACONDA', 'PRESTO'
        ]

        # Coerce both of the outputs to proto so we can easily compare equality
        # this also sanity checks that we have actually stripped all unknown/bad
        # fields
        actual_proto = Cluster(cleaned_config)
        expected_proto = Cluster(raw_config)

        assert actual_proto == expected_proto

        # Now check that the config with resolved fields is correct as well
        config_built = spawner._build_cluster_config()
        print(config_built)

        assert 'unknown_field_top_level' not in config_built
        assert 'unknown_field_config_level' not in config_built['config']
        assert 'unknown_field' not in config_built['config'][
            'initialization_actions'][0]
        assert 'consume_reservation_type' not in config_built['config'][
            'gce_cluster_config']['reservation_affinity']
        # NOTE(review): this inspects `raw_config`, which this test assigned
        # itself a few lines above, so it cannot fail. It may have been meant
        # to inspect `config_built` -- confirm the expected representation of
        # optional_components there before changing it.
        assert raw_config['config']['software_config'][
            'optional_components'] == [
                'JUPYTER', 'ZEPPELIN', 'ANACONDA', 'PRESTO'
            ]
    def test_uris(self, monkeypatch):
        """ Test that all official URI patterns work and geo location match.

        The config is built twice on the same spawner: first from
        ``tests/test_data/basic.yaml``, where the subnetwork is expected to
        stay the short name ``default``, then from
        ``tests/test_data/basic_uri.yaml``, where it is expected to be the
        ``projects/.../regions/.../subnetworks/default`` path.
        """

        def test_read_file_string(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/basic.yaml', 'r') as config_file:
                return config_file.read()

        def test_read_file_uri(*args, **kwargs):
            # Context manager so the file handle is closed deterministically.
            with open('./tests/test_data/basic_uri.yaml', 'r') as config_file:
                return config_file.read()

        def test_clustername(*args, **kwargs):
            return 'test-clustername'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file_string)
        monkeypatch.setattr(spawner, "clustername", test_clustername)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.user_options = {
            'cluster_type': 'basic.yaml',
            'cluster_zone': 'test-form1-a'
        }

        config_built = spawner._build_cluster_config()

        assert config_built['config']['gce_cluster_config'][
            'subnetwork_uri'] == "default"

        # Second pass: same spawner, but the definition now comes from the
        # URI-style file. Prevents a call to GCS as above.
        monkeypatch.setattr(spawner, "read_gcs_file", test_read_file_uri)
        monkeypatch.setattr(spawner, "clustername", test_clustername)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.user_options = {
            'cluster_type': 'basic.yaml',
            'cluster_zone': 'test-form1-a'
        }

        config_built = spawner._build_cluster_config()

        assert config_built['config']['gce_cluster_config'][
            'subnetwork_uri'] == "projects/test-project/regions/us-east1/subnetworks/default"
    def test_camel_case(self, monkeypatch):
        """Compare the config built from ``custom.yaml`` with an exact dict.

        ``read_gcs_file`` is replaced by a reader of the local fixture, then
        the whole result of ``_build_cluster_config`` is compared for equality
        with ``expected_dict``. The expected metadata and property keys mix
        camel case, underscores, dashes, dots and colons.
        """

        def fake_read_gcs_file(*args, **kwargs):
            # Context manager so the file handle is closed right after
            # reading instead of being left to the garbage collector.
            with open('./tests/test_data/custom.yaml', 'r') as config_file:
                return config_file.read()

        def fake_clustername(*args, **kwargs):
            return 'test-clustername'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We return the local file instead.
        monkeypatch.setattr(spawner, "read_gcs_file", fake_read_gcs_file)
        monkeypatch.setattr(spawner, "clustername", fake_clustername)

        spawner.project = "test-project"
        spawner.region = "us-east1"
        spawner.zone = "us-east1-d"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.user_options = {
            'cluster_type': 'custom.yaml',
            'cluster_zone': 'test-form1-a'
        }

        config_built = spawner._build_cluster_config()

        expected_dict = {
            'project_id': 'test-project',
            'labels': {
                'goog-dataproc-notebook-spawner': 'unknown'
            },
            'cluster_name': 'test-clustername',
            'config': {
                'autoscaling_config': {
                    'policy_uri':
                    'projects/my-project/regions/us-east1/autoscalingPolicies/policy-abc123'
                },
                'config_bucket': 'bucket-dash',
                'endpoint_config': {
                    'enable_http_port_access': True
                },
                'gce_cluster_config': {
                    'metadata': {
                        'KeyCamelCase': 'UlowUlow',
                        'key_with_underscore':
                        'https://downloads.io/protected/files/enterprise-trial.tar.gz',
                        'key_with_underscore_too':
                        'some_UPPER_and_UlowerU:1234',
                        'session-user': MockUser.name
                    },
                    'zone_uri':
                    'https://www.googleapis.com/compute/v1/projects/test-project/zones/test-form1-a'
                },
                'initialization_actions': [],
                'lifecycle_config': {},
                'master_config': {
                    'machine_type_uri': 'machine.1.2_numbers',
                    'min_cpu_platform': 'AUTOMATIC',
                    'disk_config': {
                        'boot_disk_size_gb': 1000
                    },
                },
                'software_config': {
                    'image_version':
                    '1.4-debian9',
                    'optional_components':
                    [Component.JUPYTER.value, Component.ANACONDA.value],
                    'properties': {
                        'dataproc:jupyter.hub.args': 'test-args-str',
                        'dataproc:jupyter.hub.enabled': 'true',
                        'dataproc:jupyter.hub.env': 'test-env-str',
                        'dataproc:jupyter.notebook.gcs.dir':
                        'gs://users-notebooks/fake',
                        'key-with-dash:UPPER_UPPER': '4000',
                        'key-with-dash-too:UlowUlowUlow': '85196m',
                        'key:and.multiple.dots.lowUlowUlow': '13312m'
                    }
                }
            }
        }
        assert expected_dict == config_built
    def test_config_paths(self, monkeypatch):
        """Check that ``_list_gcs_files`` finds every known config path.

        ``list_blobs`` on the mocked GCS client is replaced by a fake that
        serves blobs out of ``config_hierarchy``. The test then passes a
        comma-separated ``dataproc_configs`` string to ``_list_gcs_files`` and
        asserts that the result is a list holding exactly the paths in
        ``config_hierarchy``, with no duplicates.
        """

        config_hierarchy = [
            "bucket/listme/file_L1.yaml", "bucket/config/file_A1.yaml",
            "bucket/config/file_A2.yaml", "bucket/file_B1.yaml",
            "bucket-two/config/two/file_C1.yaml"
        ]

        def fake_list_blobs(*args, **kwargs):
            """Stand-in for ``Client.list_blobs`` backed by a fixed path list.

            Replaces the library function so it reads a custom list of paths
            instead of real GCS:
            https://googleapis.dev/python/storage/latest/_modules/google/cloud/storage/client.html#Client.list_blobs
            """
            bucket_or_name = args[0]
            prefix = kwargs['prefix']
            candidate_path = f'{bucket_or_name}/{prefix}'

            # Blob names are relative to the bucket, so the leading
            # "<bucket>/" component is dropped from each matching path.
            matching_blobs = [
                Blob(bucket='dummy', name=path.partition('/')[2])
                for path in config_hierarchy
                if path.startswith(candidate_path)
            ]
            return iter(matching_blobs)

        def fake_clustername(*args, **kwargs):
            return 'test-clustername'

        fake_creds = AnonymousCredentials()
        mock_dataproc_client = mock.create_autospec(
            ClusterControllerClient(credentials=fake_creds))
        mock_gcs_client = mock.create_autospec(
            storage.Client(credentials=fake_creds, project='project'))
        spawner = DataprocSpawner(hub=Hub(),
                                  dataproc=mock_dataproc_client,
                                  gcs=mock_gcs_client,
                                  user=MockUser(),
                                  _mock=True,
                                  gcs_notebooks=self.gcs_notebooks)

        # Prevents a call to GCS. We list the fixed paths above instead.
        monkeypatch.setattr(mock_gcs_client, "list_blobs", fake_list_blobs)
        monkeypatch.setattr(spawner, "clustername", fake_clustername)

        spawner.project = "test-project"
        spawner.zone = "test-self1-b"
        spawner.env_str = "test-env-str"
        spawner.args_str = "test-args-str"
        spawner.dataproc_configs = (
            "gs://bucket/config/,"
            "bucket/config/file_A1.yaml,"
            "bucket/file_B1.yaml,"
            "bucket-notexist/file.yaml,"
            "bucket/file-notexist.yaml,"
            "bucket/listme/,"
            "bucket/config-notexist/file.yaml,"
            "gs://bucket/listme/,bucket/config,bucket-two,")

        read_paths = spawner._list_gcs_files(spawner.dataproc_configs)

        assert isinstance(read_paths, list)
        # Equal length plus equal sets means no path is duplicated or missing.
        assert len(read_paths) == len(config_hierarchy)
        assert set(read_paths) == set(config_hierarchy)