def test_minimium_cluster_definition(self, monkeypatch):
    """ Some keys must always be present for JupyterHub to work. """

    def test_read_file(*args, **kwargs):
        # Serve the local fixture; `with` guarantees the handle is closed.
        with open('./tests/test_data/minimum.yaml', 'r') as f:
            return f.read()

    def test_clustername(*args, **kwargs):
        return 'test-clustername'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.zone = "test-self1-b"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"

    config_built = spawner._build_cluster_config()

    # Core identity keys must be present and resolved.
    assert 'project_id' in config_built
    assert 'cluster_name' in config_built
    assert config_built['project_id'] == 'test-project'
    assert config_built['cluster_name'] == 'test-clustername'

    assert config_built['config']['gce_cluster_config']['zone_uri'].split(
        '/')[-1] == 'test-self1-b'

    # JupyterHub requires the Jupyter and Anaconda optional components.
    assert Component['JUPYTER'].value in config_built['config'][
        'software_config']['optional_components']
    assert Component['ANACONDA'].value in config_built['config'][
        'software_config']['optional_components']

    # Hub-specific Dataproc properties must always be injected.
    assert 'dataproc:jupyter.hub.args' in config_built['config'][
        'software_config']['properties']
    assert 'dataproc:jupyter.hub.enabled' in config_built['config'][
        'software_config']['properties']
    # assert 'dataproc:jupyter.notebook.gcs.dir' in config_built['config']['software_config']['properties']
    assert 'dataproc:jupyter.hub.env' in config_built['config'][
        'software_config']['properties']
def test_locations(self, monkeypatch):
    """ Checks that machine/accelerator URIs and the user-chosen zone's
    region are correctly reflected in the built cluster config. """

    def test_read_file(*args, **kwargs):
        # Serve the local fixture; `with` guarantees the handle is closed.
        with open('./tests/test_data/basic_uri.yaml', 'r') as f:
            return f.read()

    def test_clustername(*args, **kwargs):
        return 'test-clustername'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    spawner.user_options = {
        'cluster_type': 'basic_uri.yaml',
        'cluster_zone': 'us-east1-d'
    }

    user_zone = spawner.user_options['cluster_zone']
    # Region is the zone minus its "-<letter>" suffix, e.g. us-east1-d -> us-east1.
    user_region = user_zone[:-2]

    config_built = spawner._build_cluster_config()

    # The subnetwork URI region segment must match the user-chosen zone's region.
    assert config_built['config']['gce_cluster_config'][
        'subnetwork_uri'].split('/')[-3] == user_region
    assert config_built['config']['master_config'][
        'machine_type_uri'] == 'n1-standard-4'
    assert config_built['config']['worker_config'][
        'machine_type_uri'] == 'n1-highmem-16'
    assert config_built['config']['secondary_worker_config'][
        'machine_type_uri'] == 'n1-standard-4'
    assert config_built['config']['master_config']['accelerators'][0][
        'accelerator_type_uri'] == 'nvidia-tesla-v100'
def test_cluster_definition_overrides(self, monkeypatch):
    """Check that config settings incompatible with JupyterHub
    are overwritten correctly."""

    def test_read_file(*args, **kwargs):
        # Serve the local fixture; `with` guarantees the handle is closed.
        with open('./tests/test_data/export.yaml', 'r') as f:
            return f.read()

    def test_clustername(*args, **kwargs):
        return 'test-clustername'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    spawner.user_options = {
        'cluster_type': 'export.yaml',
        'cluster_zone': 'test-form1-a'
    }

    config_built = spawner._build_cluster_config()

    # Verify that we disable Component Gateway (temporarily)
    assert config_built['config']['endpoint_config'][
        'enable_http_port_access'] == False
    # Verify that we disable preemptibility (temporarily)
    assert 'preemptibility' not in config_built['config']['master_config']
    assert 'preemptibility' not in config_built['config']['worker_config']
    # Verify that we removed cluster-specific namenode properties
    assert 'hdfs:dfs.namenode.lifeline.rpc-address' not in config_built[
        'config']['software_config']['properties']
    assert 'hdfs:dfs.namenode.servicerpc-address' not in config_built[
        'config']['software_config']['properties']
def test_cluster_definition_keep_core_values(self, monkeypatch):
    """ Some system's default values must remain no matter what. """

    def test_read_file(*args, **kwargs):
        # Serve the local fixture; `with` guarantees the handle is closed.
        with open('./tests/test_data/basic.yaml', 'r') as f:
            return f.read()

    def test_clustername(*args, **kwargs):
        return 'test-clustername'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    spawner.user_options = {
        'cluster_type': 'basic.yaml',
        'cluster_zone': 'test-form1-a'
    }

    config_built = spawner._build_cluster_config()

    # System-managed values win over anything in the YAML definition.
    assert config_built['project_id'] == 'test-project'
    assert config_built['cluster_name'] == 'test-clustername'

    assert config_built['config']['software_config']['properties'][
        'dataproc:jupyter.hub.args'] == 'test-args-str'
    assert config_built['config']['software_config']['properties'][
        'dataproc:jupyter.hub.enabled'] == 'true'
    # assert config_built['config']['software_config']['properties']['dataproc:jupyter.notebook.gcs.dir'] == f'gs://users-notebooks/fake'
    assert config_built['config']['software_config']['properties'][
        'dataproc:jupyter.hub.env'] == 'test-env-str'
def test_duration(self, monkeypatch):
    """ Checks that initialization-action timeouts given either as a
    seconds string or as a Duration proto both normalize to seconds. """

    def test_read_file(*args, **kwargs):
        # Serve the local fixture; `with` guarantees the handle is closed.
        with open('./tests/test_data/duration.yaml', 'r') as f:
            return f.read()

    def test_clustername(*args, **kwargs):
        return 'test-clustername'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    spawner.user_options = {
        'cluster_type': 'duration.yaml',
        'cluster_zone': 'test-form1-a'
    }

    config_built = spawner._build_cluster_config()

    # Test 600s string
    assert config_built['config']['initialization_actions'][0][
        'execution_timeout']['seconds'] == 600
    # Test Duration protobuf
    assert config_built['config']['initialization_actions'][1][
        'execution_timeout']['seconds'] == 600
def test_cluster_definition_check_core_fields(self, monkeypatch):
    """ Values chosen by the user through the form overwrites others. If the
    admin wants to prevent that behavior, they should remove form elements.
    TODO(mayran): Check keys so users can not add custom ones.
    """

    def test_read_file(*args, **kwargs):
        # Serve the local fixture; `with` guarantees the handle is closed.
        with open('./tests/test_data/basic.yaml', 'r') as f:
            return f.read()

    def test_username(*args, **kwargs):
        return 'foo-user'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
    monkeypatch.setattr(spawner, "get_username", test_username)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    # The cluster name is derived from the pattern plus the username.
    spawner.cluster_name_pattern = 'my-cluster-{}'
    spawner.user_options = {
        'cluster_type': 'basic.yaml',
        'cluster_zone': 'test-form1-a'
    }

    config_built = spawner._build_cluster_config()

    assert config_built['cluster_name'] == 'my-cluster-foo-user'
    assert config_built['project_id'] == 'test-project'
def test_metadata(self, monkeypatch):
    """ Checks that YAML-provided GCE metadata is kept and the spawner
    appends the session-user entry. """

    def test_read_file(*args, **kwargs):
        # Serve the local fixture; `with` guarantees the handle is closed.
        with open('./tests/test_data/basic.yaml', 'r') as f:
            return f.read()

    def test_clustername(*args, **kwargs):
        return 'test-clustername'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    spawner.user_options = {
        'cluster_type': 'basic.yaml',
        'cluster_zone': 'test-form1-a'
    }

    config_built = spawner._build_cluster_config()

    assert config_built['config']['gce_cluster_config']['metadata'] == {
        'm1': 'v1',
        'm2': 'v2',
        'session-user': MockUser.name
    }
def test_validate_proto(self, monkeypatch):
    """ Checks that _validate_proto strips unknown fields and bad enum
    values from a cluster definition, emitting one warning per removal. """

    def test_read_file(*args, **kwargs):
        # Serve the local fixture; `with` guarantees the handle is closed.
        with open('./tests/test_data/unknown_fields.yaml', 'r') as f:
            return f.read()

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    spawner.user_options = {
        'cluster_type': 'basic_uri.yaml',
        'cluster_zone': 'us-east1-d'
    }

    cleaned_config = spawner.get_cluster_definition('')
    warnings = dataprocspawner.spawner._validate_proto(
        cleaned_config, Cluster)

    # Check that we had appropriate warning messages
    assert len(warnings) == 7
    expected_warnings = [
        'Removing unknown/bad value BAD_ENUM_VALUE for field consume_reservation_type.',
        "Removing unknown field unknown_field for class <class 'google.cloud.dataproc_v1beta2.types.clusters.NodeInitializationAction'>",
        'Removing unknown/bad value UNKNOWN_COMPONENT_1 for field optional_components.',
        'Removing unknown/bad value UNKNOWN_COMPONENT_2 for field optional_components.',
        'Removing unknown/bad value UNKNOWN_COMPONENT_3 for field optional_components.',
        "Removing unknown field unknown_field_config_level for class <class 'google.cloud.dataproc_v1beta2.types.clusters.ClusterConfig'>",
        "Removing unknown field unknown_field_top_level for class <class 'google.cloud.dataproc_v1beta2.types.clusters.Cluster'>",
    ]
    for w in expected_warnings:
        assert w in warnings, f'Expected message {w} in warnings {warnings}'

    raw_config = spawner.get_cluster_definition('')

    # Construct expected output by stripping the known-bad fields by hand.
    del raw_config['unknown_field_top_level']
    del raw_config['config']['unknown_field_config_level']
    del raw_config['config']['initialization_actions'][0]['unknown_field']
    del raw_config['config']['gce_cluster_config']['reservation_affinity'][
        'consume_reservation_type']
    raw_config['config']['software_config']['optional_components'] = [
        'JUPYTER', 'ZEPPELIN', 'ANACONDA', 'PRESTO'
    ]

    # Coerce both of the outputs to proto so we can easily compare equality
    # this also sanity checks that we have actually stripped all unknown/bad
    # fields
    actual_proto = Cluster(cleaned_config)
    expected_proto = Cluster(raw_config)
    assert actual_proto == expected_proto

    # Now check that the config with resolved fields is correct as well
    config_built = spawner._build_cluster_config()
    assert 'unknown_field_top_level' not in config_built
    assert 'unknown_field_config_level' not in config_built['config']
    assert 'unknown_field' not in config_built['config'][
        'initialization_actions'][0]
    assert 'consume_reservation_type' not in config_built['config'][
        'gce_cluster_config']['reservation_affinity']
    # NOTE(review): this asserts on raw_config, which was assigned exactly
    # this list a few lines above, so it is vacuous — it probably should
    # check config_built's optional_components instead. TODO confirm
    # ordering before changing it.
    assert raw_config['config']['software_config'][
        'optional_components'] == [
            'JUPYTER', 'ZEPPELIN', 'ANACONDA', 'PRESTO'
        ]
def test_uris(self, monkeypatch):
    """ Test that all official URI patterns work and geo location match."""

    def test_read_file_string(*args, **kwargs):
        # Fixture whose subnetwork is a short name ("default").
        with open('./tests/test_data/basic.yaml', 'r') as f:
            return f.read()

    def test_read_file_uri(*args, **kwargs):
        # Fixture whose subnetwork is a full resource URI.
        with open('./tests/test_data/basic_uri.yaml', 'r') as f:
            return f.read()

    def test_clustername(*args, **kwargs):
        return 'test-clustername'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file_string)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    spawner.user_options = {
        'cluster_type': 'basic.yaml',
        'cluster_zone': 'test-form1-a'
    }

    config_built = spawner._build_cluster_config()

    # Short subnetwork names pass through untouched.
    assert config_built['config']['gce_cluster_config'][
        'subnetwork_uri'] == "default"

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file_uri)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    spawner.user_options = {
        'cluster_type': 'basic.yaml',
        'cluster_zone': 'test-form1-a'
    }

    config_built = spawner._build_cluster_config()

    # Full URIs are rewritten to the spawner's project/region.
    assert config_built['config']['gce_cluster_config'][
        'subnetwork_uri'] == "projects/test-project/regions/us-east1/subnetworks/default"
def test_camel_case(self, monkeypatch):
    """ Checks that camelCase keys, underscores, and dashed property names
    in the YAML definition survive the snake_case conversion intact. """

    def test_read_file(*args, **kwargs):
        # Serve the local fixture; `with` guarantees the handle is closed.
        with open('./tests/test_data/custom.yaml', 'r') as f:
            return f.read()

    def test_clustername(*args, **kwargs):
        return 'test-clustername'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(spawner, "read_gcs_file", test_read_file)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.region = "us-east1"
    spawner.zone = "us-east1-d"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    spawner.user_options = {
        'cluster_type': 'custom.yaml',
        'cluster_zone': 'test-form1-a'
    }

    config_built = spawner._build_cluster_config()

    # Full expected snapshot of the built configuration.
    expected_dict = {
        'project_id': 'test-project',
        'labels': {
            'goog-dataproc-notebook-spawner': 'unknown'
        },
        'cluster_name': 'test-clustername',
        'config': {
            'autoscaling_config': {
                'policy_uri':
                    'projects/my-project/regions/us-east1/autoscalingPolicies/policy-abc123'
            },
            'config_bucket': 'bucket-dash',
            'endpoint_config': {
                'enable_http_port_access': True
            },
            'gce_cluster_config': {
                'metadata': {
                    'KeyCamelCase': 'UlowUlow',
                    'key_with_underscore':
                        'https://downloads.io/protected/files/enterprise-trial.tar.gz',
                    'key_with_underscore_too':
                        'some_UPPER_and_UlowerU:1234',
                    'session-user': MockUser.name
                },
                'zone_uri':
                    'https://www.googleapis.com/compute/v1/projects/test-project/zones/test-form1-a'
            },
            'initialization_actions': [],
            'lifecycle_config': {},
            'master_config': {
                'machine_type_uri': 'machine.1.2_numbers',
                'min_cpu_platform': 'AUTOMATIC',
                'disk_config': {
                    'boot_disk_size_gb': 1000
                },
            },
            'software_config': {
                'image_version': '1.4-debian9',
                'optional_components': [
                    Component.JUPYTER.value, Component.ANACONDA.value
                ],
                'properties': {
                    'dataproc:jupyter.hub.args': 'test-args-str',
                    'dataproc:jupyter.hub.enabled': 'true',
                    'dataproc:jupyter.hub.env': 'test-env-str',
                    'dataproc:jupyter.notebook.gcs.dir':
                        'gs://users-notebooks/fake',
                    'key-with-dash:UPPER_UPPER': '4000',
                    'key-with-dash-too:UlowUlowUlow': '85196m',
                    'key:and.multiple.dots.lowUlowUlow': '13312m'
                }
            }
        }
    }

    assert expected_dict == config_built
def test_config_paths(self, monkeypatch):
    """ Checks that configuration paths are found. """
    config_hierarchy = [
        "bucket/listme/file_L1.yaml",
        "bucket/config/file_A1.yaml",
        "bucket/config/file_A2.yaml",
        "bucket/file_B1.yaml",
        "bucket-two/config/two/file_C1.yaml"
    ]

    expected = config_hierarchy

    def test_list_blobs(*args, **kwargs):
        """ Rewrites library function to reads a custom list of paths vs
        real GCS.
        https://googleapis.dev/python/storage/latest/_modules/google/cloud/storage/client.html#Client.list_blobs
        """
        bucket_or_name = args[0]
        prefix = kwargs['prefix']
        candidate_path = f'{bucket_or_name}/{prefix}'
        config_paths = []
        for c in config_hierarchy:
            if c.startswith(candidate_path):
                # Blob names are relative to the bucket (strip the bucket
                # segment from the fake path).
                fn = '/'.join(c.split('/')[1:])
                b = Blob(bucket='dummy', name=fn)
                config_paths.append(b)
        return iter(config_paths)

    def test_clustername(*args, **kwargs):
        return 'test-clustername'

    fake_creds = AnonymousCredentials()
    mock_dataproc_client = mock.create_autospec(
        ClusterControllerClient(credentials=fake_creds))
    mock_gcs_client = mock.create_autospec(
        storage.Client(credentials=fake_creds, project='project'))
    spawner = DataprocSpawner(hub=Hub(),
                              dataproc=mock_dataproc_client,
                              gcs=mock_gcs_client,
                              user=MockUser(),
                              _mock=True,
                              gcs_notebooks=self.gcs_notebooks)

    # Prevents a call to GCS. We return the local file instead.
    monkeypatch.setattr(mock_gcs_client, "list_blobs", test_list_blobs)
    monkeypatch.setattr(spawner, "clustername", test_clustername)

    spawner.project = "test-project"
    spawner.zone = "test-self1-b"
    spawner.env_str = "test-env-str"
    spawner.args_str = "test-args-str"
    # Mix of valid folders, valid files, and non-existent entries; the
    # non-existent ones must be silently skipped.
    spawner.dataproc_configs = (
        "gs://bucket/config/,"
        "bucket/config/file_A1.yaml,"
        "bucket/file_B1.yaml,"
        "bucket-notexist/file.yaml,"
        "bucket/file-notexist.yaml,"
        "bucket/listme/,"
        "bucket/config-notexist/file.yaml,"
        "gs://bucket/listme/,bucket/config,bucket-two,")

    read_paths = spawner._list_gcs_files(spawner.dataproc_configs)

    # Every fixture path is found exactly once, as a list.
    assert isinstance(read_paths, list)
    assert len(read_paths) == len(config_hierarchy)
    assert set(read_paths) == set(config_hierarchy)