def test_entity_uri_parse_glob_failure(self):
    """Glob patterns in the entity ID are not yet supported and should raise."""
    entity_uri = "dataplex://projects/project-id/locations/us-central1/lakes" \
                 "/lake-id/zones/zone-id/entities/test_entity_*"
    # This should be supported eventually
    with pytest.raises(NotImplementedError):
        EntityUri.from_uri(entity_uri)
def test_entity_uri_parse_asset_id_failure(self):
    """Dataplex asset URIs are not valid entity URIs and should raise ValueError."""
    entity_uri = "dataplex://projects/project-id/locations/us-central1/lakes" \
                 "/lake-id/zones/zone-id/assets/asset-id"
    with pytest.raises(ValueError):
        EntityUri.from_uri(entity_uri)
@classmethod
def from_bq_entity_uri(
    cls: DqEntity,
    entity_uri: EntityUri,
    bigquery_client: BigQueryClient,
) -> DqEntity:
    """Build a DqEntity from a BigQuery entity_uri using the table schema from BigQuery."""
    project_id = entity_uri.get_configs("projects")
    table_name = entity_uri.get_table_name()
    configs = entity_uri.configs_dict
    entity_id = entity_uri.get_entity_id()
    # Look up the column schema and partitioning details directly from BigQuery.
    columns_dict = bigquery_client.get_table_schema(table=table_name, project_id=project_id)
    entity_configs = {
        "source_database": "BIGQUERY",
        "resource_type": "BIGQUERY",
        "table_name": configs.get("tables"),
        "dataset_name": configs.get("datasets"),
        "project_name": configs.get("projects"),
        "columns": columns_dict.get("columns"),
        "environment_override": {},
        "entity_id": entity_id,
        "dataplex_name": None,
        "dataplex_lake": None,
        "dataplex_zone": None,
        "dataplex_location": None,
        "dataplex_asset_id": None,
        "dataplex_createTime": None,
        "dataplex_updateTime": None,
        "partition_fields": columns_dict.get("partition_fields"),
    }
    return DqEntity.from_dict(entity_id=entity_id.upper(), kwargs=entity_configs)
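# Usage sketch (illustrative only; the URI, identifiers, and `bigquery_client`
# below are assumptions, not values defined in this module): a BigQuery
# entity_uri is parsed first, then converted into a DqEntity using the table
# schema retrieved from BigQuery.
#
#   uri = EntityUri.from_uri(
#       "bigquery://projects/project-id/datasets/dataset-id/tables/table-id")
#   entity = DqEntity.from_bq_entity_uri(entity_uri=uri, bigquery_client=bigquery_client)
#   print(entity.get_table_name())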
def test_entity_uri_parse_dataplex_uri_without_default_configs(self):
    """A fully-specified Dataplex entity URI parses without any default configs."""
    entity_uri = "dataplex://projects/project-id/locations/us-central1/lakes" \
                 "/lake-id/zones/zone-id/entities/entity-id"
    parsed_uri = EntityUri.from_uri(entity_uri)
    expected_entity_dict = {
        "uri": "dataplex://projects/project-id/locations/us-central1/lakes"
               "/lake-id/zones/zone-id/entities/entity-id",
        "scheme": "DATAPLEX",
        "entity_id": "entity-id",
        "db_primary_key": "projects/project-id/locations/us-central1/lakes/lake-id/zones/zone-id/entities/entity-id",  # noqa: E501
        "configs": {
            "projects": "project-id",
            "locations": "us-central1",
            "lakes": "lake-id",
            "zones": "zone-id",
            "entities": "entity-id",
        },
    }
    assert parsed_uri.scheme == expected_entity_dict["scheme"]
    assert parsed_uri.uri_configs_string == (
        "projects/project-id/locations/us-central1/lakes"
        "/lake-id/zones/zone-id/entities/entity-id")
    assert parsed_uri.default_configs is None
    assert parsed_uri.complete_uri_string == expected_entity_dict["uri"]
    assert parsed_uri.get_entity_id() == expected_entity_dict["entity_id"]
    assert parsed_uri.configs_dict == expected_entity_dict["configs"]
    assert parsed_uri.get_db_primary_key() == expected_entity_dict["db_primary_key"]
    assert parsed_uri.to_dict() == expected_entity_dict
def test_entity_uri_parse_bigquery_uri_without_default_configs(self):
    """A fully-specified BigQuery entity URI parses without any default configs."""
    bigquery_uri = "bigquery://projects/project-id/datasets/dataset-id/tables/table-id"
    parsed_uri = EntityUri.from_uri(bigquery_uri)
    print(parsed_uri)
    expected_entity_dict = {
        "uri": "bigquery://projects/project-id/datasets/dataset-id/tables/table-id",
        "scheme": "BIGQUERY",
        "entity_id": "projects/project-id/datasets/dataset-id/tables/table-id",
        "db_primary_key": "projects/project-id/datasets/dataset-id/tables/table-id",
        "configs": {
            "projects": "project-id",
            "datasets": "dataset-id",
            "tables": "table-id",
        },
    }
    assert parsed_uri.scheme == expected_entity_dict["scheme"]
    assert parsed_uri.uri_configs_string == "projects/project-id/datasets/dataset-id/tables/table-id"
    assert parsed_uri.default_configs is None
    assert parsed_uri.complete_uri_string == expected_entity_dict["uri"]
    assert parsed_uri.get_entity_id() == expected_entity_dict["entity_id"]
    assert parsed_uri.get_db_primary_key() == expected_entity_dict["db_primary_key"]
    assert parsed_uri.configs_dict == expected_entity_dict["configs"]
    assert parsed_uri.to_dict() == expected_entity_dict
def test_entity_uri_parse_override_project_lake_id_failure(self):
    """An incomplete Dataplex URI fails without defaults but resolves once the
    missing configs are supplied via default_configs."""
    entity_uri = "dataplex://projects/project-id-2/zones/zone-id/entities/entity-id"
    default_configs = {
        "projects": "project-id-1",
        "locations": "us-central1",
        "lakes": "lake-id",
    }
    # This should fail without metadata_defaults
    with pytest.raises(ValueError):
        EntityUri.from_uri(entity_uri)
    parsed_uri = EntityUri.from_uri(uri_string=entity_uri,
                                    default_configs=default_configs)
    assert parsed_uri.complete_uri_string == entity_uri
    assert parsed_uri.get_db_primary_key() == (
        "projects/project-id-2/locations/us-central1/lakes"
        "/lake-id/zones/zone-id/entities/entity-id")
def _resolve_dataplex_entity_uri(
    self,
    entity_uri: dq_entity_uri.EntityUri,
    dataplex_client: clouddq_dataplex.CloudDqDataplexClient,
    bigquery_client: BigQueryClient,
) -> dq_entity.DqEntity:
    dataplex_entity = dataplex_client.get_dataplex_entity(
        gcp_project_id=entity_uri.get_configs("projects"),
        location_id=entity_uri.get_configs("locations"),
        lake_name=entity_uri.get_configs("lakes"),
        zone_id=entity_uri.get_configs("zones"),
        entity_id=entity_uri.get_entity_id(),
    )
    clouddq_entity = dq_entity.DqEntity.from_dataplex_entity(
        entity_id=entity_uri.get_db_primary_key(),
        dataplex_entity=dataplex_entity,
    )
    entity_uri_primary_key = entity_uri.get_db_primary_key().upper()
    gcs_entity_external_table_name = clouddq_entity.get_table_name()
    logger.debug(
        f"GCS Entity External Table Name is {gcs_entity_external_table_name}")
    bq_table_exists = bigquery_client.is_table_exists(
        table=gcs_entity_external_table_name,
        project_id=clouddq_entity.instance_name,
    )
    if bq_table_exists:
        logger.debug(
            f"The External Table {gcs_entity_external_table_name} for Entity URI "
            f"{entity_uri_primary_key} exists in BigQuery.")
    else:
        raise RuntimeError(
            f"Unable to find BigQuery External Table {gcs_entity_external_table_name} "
            f"for Entity URI {entity_uri_primary_key}")
    return clouddq_entity
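# Call sketch (assumed usage; the identifiers below are illustrative, not from
# this module): the resolver expects a fully-resolved Dataplex entity URI of the
# form "dataplex://projects/<p>/locations/<l>/lakes/<lake>/zones/<z>/entities/<e>"
# and raises RuntimeError if the backing BigQuery external table is missing.
#
#   uri = dq_entity_uri.EntityUri.from_uri(
#       "dataplex://projects/p/locations/us-central1/lakes/l/zones/z/entities/e")
#   entity = self._resolve_dataplex_entity_uri(
#       entity_uri=uri,
#       dataplex_client=dataplex_client,
#       bigquery_client=bigquery_client)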
def _resolve_bigquery_entity_uri(
    self,
    entity_uri: dq_entity_uri.EntityUri,
    dataplex_client: clouddq_dataplex.CloudDqDataplexClient,
    bigquery_client: BigQueryClient,
) -> dq_entity.DqEntity:
    required_arguments = ["projects", "datasets", "tables"]
    for argument in required_arguments:
        uri_argument = entity_uri.get_configs(argument)
        if not uri_argument:
            raise RuntimeError(
                f"Failed to retrieve default BigQuery '{argument}' for "
                f"entity_uri: {entity_uri.complete_uri_string}.\n"
                f"'{argument}' is a required argument to look up metadata for the entity_uri "
                "using the BigQuery API.\n")
    project_id = entity_uri.get_configs("projects")
    table_name = entity_uri.get_table_name()
    bq_table_exists = bigquery_client.is_table_exists(
        table=table_name, project_id=project_id)
    if bq_table_exists:
        logger.debug(f"The Table '{table_name}' in the "
                     f"specified entity_uri '{entity_uri}' "
                     f"exists in BigQuery.")
        # Prefer Dataplex metadata when the table is registered as a Dataplex
        # entity; otherwise fall back to the BigQuery table schema.
        dataplex_entity = self.is_dataplex_entity(
            entity_uri=entity_uri, dataplex_client=dataplex_client)
        if dataplex_entity:
            clouddq_entity = dataplex_entity
        else:
            clouddq_entity = dq_entity.DqEntity.from_bq_entity_uri(
                entity_uri=entity_uri,
                bigquery_client=bigquery_client,
            )
        return clouddq_entity
    else:
        raise RuntimeError(
            f"BigQuery Table '{table_name}' specified in the "
            f"entity uri '{entity_uri}' does not exist")
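# Resolution-order sketch (names and values below are illustrative, not from
# this module): given "bigquery://projects/p/datasets/d/tables/t", the table's
# existence is checked in BigQuery, Dataplex metadata is used when the table is
# registered as a Dataplex entity, and the BigQuery schema is used otherwise.
#
#   uri = dq_entity_uri.EntityUri.from_uri(
#       "bigquery://projects/p/datasets/d/tables/t",
#       default_configs=metadata_registry_defaults)
#   entity = self._resolve_bigquery_entity_uri(
#       entity_uri=uri,
#       dataplex_client=dataplex_client,
#       bigquery_client=bigquery_client)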
def is_dataplex_entity(
    self,
    entity_uri: dq_entity_uri.EntityUri,
    dataplex_client: clouddq_dataplex.CloudDqDataplexClient,
):
    required_arguments = ["projects", "lakes", "locations", "zones"]
    for argument in required_arguments:
        uri_argument = entity_uri.get_configs(argument)
        if not uri_argument:
            logger.info(
                f"Failed to retrieve default Dataplex '{argument}' for "
                f"entity_uri: {entity_uri.complete_uri_string}.\n"
                f"'{argument}' is a required argument to look up metadata for the entity_uri "
                "using the Dataplex Metadata API.\n"
                "Ensure the BigQuery dataset containing this table "
                "is attached as an asset in Dataplex.\n"
                "You can then specify the corresponding Dataplex "
                "projects/locations/lakes/zones as part of the "
                "metadata_default_registries YAML configs, e.g.\n"
                f"{SAMPLE_DEFAULT_REGISTRIES_YAML}")
            return False
    dataplex_entities_match = dataplex_client.list_dataplex_entities(
        gcp_project_id=entity_uri.get_configs("projects"),
        location_id=entity_uri.get_configs("locations"),
        lake_name=entity_uri.get_configs("lakes"),
        zone_id=entity_uri.get_configs("zones"),
        data_path=entity_uri.get_entity_id(),
    )
    logger.info(
        f"Retrieved Dataplex Entities:\n{pformat(dataplex_entities_match)}")
    if len(dataplex_entities_match) != 1:
        logger.info("Failed to retrieve Dataplex Metadata entry for "
                    f"entity_uri '{entity_uri.complete_uri_string}' "
                    f"with error:\n"
                    f"{pformat(json.dumps(dataplex_entities_match))}\n\n"
                    f"Parsed entity_uri configs:\n"
                    f"{pformat(entity_uri.to_dict())}\n\n")
        return False
    else:
        dataplex_entity = dataplex_entities_match[0]
        clouddq_entity = dq_entity.DqEntity.from_dataplex_entity(
            entity_id=entity_uri.get_db_primary_key(),
            dataplex_entity=dataplex_entity,
        )
        return clouddq_entity
def test_dq_entity_parse_bigquery_uri(self,
                                      gcp_project_id,
                                      test_dataplex_metadata_defaults_configs,
                                      test_bigquery_client):
    bq_entity_uri_string = f"bigquery://projects/{gcp_project_id}/datasets/" \
                           f"austin_311/tables/contact_details_partitioned"
    bq_entity_uri = EntityUri.from_uri(
        uri_string=bq_entity_uri_string,
        default_configs=test_dataplex_metadata_defaults_configs,
    )
    clouddq_entity = DqEntity.from_bq_entity_uri(
        entity_uri=bq_entity_uri,
        bigquery_client=test_bigquery_client)
    clouddq_entity_expected_dict = {
        bq_entity_uri_string.upper().split("://")[1]: {
            'source_database': 'BIGQUERY',
            'table_name': 'contact_details_partitioned',
            'database_name': 'austin_311',
            'instance_name': f'{gcp_project_id}',
            'columns': {
                'ROW_ID': {'name': 'row_id', 'data_type': 'STRING'},
                'CONTACT_TYPE': {'name': 'contact_type', 'data_type': 'STRING'},
                'VALUE': {'name': 'value', 'data_type': 'STRING'},
                'TS': {'name': 'ts', 'data_type': 'TIMESTAMP'},
            },
            'resource_type': 'BIGQUERY',
            'partition_fields': [{
                'name': 'ts',
                'type': 'TIMESTAMP',
                'partitioning_type': 'DAY',
            }],
            'dataset_name': 'austin_311',
            'project_name': f'{gcp_project_id}',
        }
    }
    assert clouddq_entity.to_dict() == clouddq_entity_expected_dict
def test_entity_uri_parse_failure(self, entity_uri, error_type):
    """Parametrized invalid entity URIs raise the expected error type."""
    with pytest.raises(error_type):
        EntityUri.from_uri(entity_uri)
@classmethod
def from_dict(
    cls: DqRuleBinding,
    rule_binding_id: str,
    kwargs: dict,
    default_configs: dict | None = None,
    validate_uri: bool = True,
) -> DqRuleBinding:
    """Build a DqRuleBinding from a rule binding configuration dictionary.

    Args:
        rule_binding_id: ID of the rule binding being parsed.
        kwargs: rule binding configuration dictionary.
        default_configs: default entity_uri configs used to complete the URI.
        validate_uri: whether to validate the parsed entity_uri.

    Returns:
        DqRuleBinding: the parsed rule binding.
    """
    entity_config: dict = get_keys_from_dict_and_assert_oneof(
        config_id=rule_binding_id,
        kwargs=kwargs,
        keys=["entity_uri", "entity_id"],
    )
    if "entity_id" in entity_config:
        entity_id = entity_config["entity_id"]
        entity_uri = None
    if "entity_uri" in entity_config:
        parsed_entity_uri = EntityUri.from_uri(
            entity_config["entity_uri"],
            default_configs=default_configs,
            validate_uri=validate_uri,
        )
        entity_id = parsed_entity_uri.get_entity_id()
        entity_uri = parsed_entity_uri
    if entity_id:
        entity_id = entity_id.upper()
    column_id: str = get_from_dict_and_assert(
        config_id=rule_binding_id,
        kwargs=kwargs,
        key="column_id",
    )
    if column_id:
        column_id = column_id.upper()
    row_filter_id: str = get_from_dict_and_assert(
        config_id=rule_binding_id,
        kwargs=kwargs,
        key="row_filter_id",
    )
    if row_filter_id:
        row_filter_id = row_filter_id.upper()
    rule_ids: list[str] = get_from_dict_and_assert(
        config_id=rule_binding_id,
        kwargs=kwargs,
        key="rule_ids",
        assertion=lambda x: type(x) == list,
        error_msg=f"Rule Binding ID: '{rule_binding_id}' must have defined value "
                  f"'rule_ids' of type 'list'.",
    )
    incremental_time_filter_column_id: str | None = kwargs.get(
        "incremental_time_filter_column_id", None)
    if incremental_time_filter_column_id:
        incremental_time_filter_column_id = incremental_time_filter_column_id.upper()
    metadata: dict | None = kwargs.get("metadata", dict())
    if type(metadata) != dict:
        raise ValueError(
            f"Rule Binding ID: '{rule_binding_id}' has invalid "
            f"metadata field with type {type(metadata)} and values: {metadata}\n"
            "'metadata' must be of type dictionary.")
    return DqRuleBinding(
        rule_binding_id=str(rule_binding_id).upper(),
        entity_id=entity_id,
        entity_uri=entity_uri,
        column_id=column_id,
        row_filter_id=row_filter_id,
        incremental_time_filter_column_id=incremental_time_filter_column_id,
        rule_ids=rule_ids,
        metadata=metadata,
    )
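# Minimal sketch of the kwargs shape from_dict() expects (values are
# illustrative; only the keys read above are taken from this module):
#
#   DqRuleBinding.from_dict(
#       rule_binding_id="T1_URI_EXAMPLE",
#       kwargs={
#           "entity_uri": "bigquery://projects/project-id/datasets/dataset-id/tables/table-id",
#           "column_id": "value",
#           "row_filter_id": "NONE",
#           "rule_ids": ["NOT_NULL_SIMPLE"],
#           "metadata": {"team": "data-engineering"},
#       },
#   )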