    def test_entity_uri_parse_glob_failure(self):
        """Glob patterns in the entity path are not yet supported."""
        entity_uri = "dataplex://projects/project-id/locations/us-central1/lakes" \
                     "/lake-id/zones/zone-id/entities/test_entity_*"
        # This should be supported eventually
        with pytest.raises(NotImplementedError):
            EntityUri.from_uri(entity_uri)
    def test_entity_uri_parse_asset_id_failure(self):
        """ """
        entity_uri = "dataplex://projects/project-id/locations/us-central1/lakes" \
                     "/lake-id/zones/zone-id/assets/asset-id"

        with pytest.raises(ValueError):
            EntityUri.from_uri(entity_uri)
Example 3
    def from_bq_entity_uri(cls: DqEntity, entity_uri: EntityUri,
                           bigquery_client: BigQueryClient) -> DqEntity:
        """Build a DqEntity from a BigQuery entity URI, using the table schema returned by the BigQuery client."""
        project_id = entity_uri.get_configs("projects")
        table_name = entity_uri.get_table_name()
        configs = entity_uri.configs_dict
        entity_id = entity_uri.get_entity_id()
        columns_dict = bigquery_client.get_table_schema(table=table_name,
                                                        project_id=project_id)
        entity_configs = {
            "source_database": "BIGQUERY",
            "resource_type": "BIGQUERY",
            "table_name": configs.get("tables"),
            "dataset_name": configs.get("datasets"),
            "project_name": configs.get("projects"),
            "columns": columns_dict.get("columns"),
            "environment_override": {},
            "entity_id": entity_id,
            "dataplex_name": None,
            "dataplex_lake": None,
            "dataplex_zone": None,
            "dataplex_location": None,
            "dataplex_asset_id": None,
            "dataplex_createTime": None,
            "dataplex_updateTime": None,
            "partition_fields": columns_dict.get("partition_fields"),
        }

        return DqEntity.from_dict(entity_id=entity_id.upper(),
                                  kwargs=entity_configs)
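
A minimal usage sketch for this constructor. The import paths and the default-constructed BigQueryClient below are assumptions for illustration, not taken from this listing:

# Import paths and client construction are assumptions for illustration.
from clouddq.classes.dq_entity import DqEntity
from clouddq.classes.dq_entity_uri import EntityUri
from clouddq.integration.bigquery.bigquery_client import BigQueryClient

bq_uri = EntityUri.from_uri(
    "bigquery://projects/project-id/datasets/dataset-id/tables/table-id")
entity = DqEntity.from_bq_entity_uri(
    entity_uri=bq_uri,
    bigquery_client=BigQueryClient(),
)
print(entity.to_dict())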
    def test_entity_uri_parse_dataplex_uri_without_default_configs(self):
        """A fully qualified Dataplex URI parses without any default configs."""
        entity_uri = "dataplex://projects/project-id/locations/us-central1/lakes" \
                     "/lake-id/zones/zone-id/entities/entity-id"
        parsed_uri = EntityUri.from_uri(entity_uri)
        expected_entity_dict = {
            "uri": "dataplex://projects/project-id/locations/us-central1/lakes"
                   "/lake-id/zones/zone-id/entities/entity-id",
            "scheme": "DATAPLEX",
            "entity_id": "entity-id",
            "db_primary_key":
                "projects/project-id/locations/us-central1/lakes/lake-id/zones/zone-id/entities/entity-id",  # noqa: E501
            "configs": {
                "projects": "project-id",
                "locations": "us-central1",
                "lakes": "lake-id",
                "zones": "zone-id",
                "entities": "entity-id",
            },
        }
        assert parsed_uri.scheme == expected_entity_dict["scheme"]
        assert parsed_uri.uri_configs_string == (
            "projects/project-id/locations/us-central1/lakes"
            "/lake-id/zones/zone-id/entities/entity-id")
        assert parsed_uri.default_configs is None
        assert parsed_uri.complete_uri_string == expected_entity_dict["uri"]
        assert parsed_uri.get_entity_id() == expected_entity_dict["entity_id"]
        assert parsed_uri.configs_dict == expected_entity_dict["configs"]
        assert parsed_uri.get_db_primary_key() == expected_entity_dict["db_primary_key"]
        assert parsed_uri.to_dict() == expected_entity_dict
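
Individual URI components can also be read back one at a time via get_configs, which is how the resolver code further down this listing consumes a parsed URI; a short sketch:

# Each path segment of the parsed URI is addressable by its key.
parsed_uri = EntityUri.from_uri(
    "dataplex://projects/project-id/locations/us-central1/lakes"
    "/lake-id/zones/zone-id/entities/entity-id")
assert parsed_uri.get_configs("projects") == "project-id"
assert parsed_uri.get_configs("lakes") == "lake-id"
assert parsed_uri.get_configs("zones") == "zone-id"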
    def test_entity_uri_parse_bigquery_uri_without_default_configs(self):
        """A fully qualified BigQuery URI parses without any default configs."""
        bigquery_uri = "bigquery://projects/project-id/datasets/dataset-id/tables/table-id"
        parsed_uri = EntityUri.from_uri(bigquery_uri)
        print(parsed_uri)
        expected_entity_dict = {
            "uri": "bigquery://projects/project-id/datasets/dataset-id/tables/table-id",
            "scheme": "BIGQUERY",
            "entity_id": "projects/project-id/datasets/dataset-id/tables/table-id",
            "db_primary_key": "projects/project-id/datasets/dataset-id/tables/table-id",
            "configs": {
                "projects": "project-id",
                "datasets": "dataset-id",
                "tables": "table-id",
            },
        }
        assert parsed_uri.scheme == expected_entity_dict["scheme"]
        assert parsed_uri.uri_configs_string == "projects/project-id/datasets/dataset-id/tables/table-id"
        assert parsed_uri.default_configs is None
        assert parsed_uri.complete_uri_string == expected_entity_dict["uri"]
        assert parsed_uri.get_entity_id() == expected_entity_dict["entity_id"]
        assert parsed_uri.get_db_primary_key() == expected_entity_dict["db_primary_key"]
        assert parsed_uri.configs_dict == expected_entity_dict["configs"]
        assert parsed_uri.to_dict() == expected_entity_dict
    def test_entity_uri_parse_override_project_lake_id_failure(self):
        """A partial Dataplex URI fails without defaults and parses once defaults are supplied."""
        entity_uri = "dataplex://projects/project-id-2/zones/zone-id/entities/entity-id"
        default_configs = {
            "projects": "project-id-1",
            "locations": "us-central1",
            "lakes": "lake-id",
        }
        # This should fail without metadata_defaults
        with pytest.raises(ValueError):
            EntityUri.from_uri(entity_uri)
        parsed_uri = EntityUri.from_uri(uri_string=entity_uri,
                                        default_configs=default_configs)
        assert parsed_uri.complete_uri_string == entity_uri
        assert parsed_uri.get_db_primary_key() == (
            "projects/project-id-2/locations/us-central1/lakes"
            "/lake-id/zones/zone-id/entities/entity-id")
Example 7
    def _resolve_dataplex_entity_uri(
        self,
        entity_uri: dq_entity_uri.EntityUri,
        dataplex_client: clouddq_dataplex.CloudDqDataplexClient,
        bigquery_client: BigQueryClient,
    ) -> dq_entity.DqEntity:
        dataplex_entity = dataplex_client.get_dataplex_entity(
            gcp_project_id=entity_uri.get_configs("projects"),
            location_id=entity_uri.get_configs("locations"),
            lake_name=entity_uri.get_configs("lakes"),
            zone_id=entity_uri.get_configs("zones"),
            entity_id=entity_uri.get_entity_id(),
        )
        clouddq_entity = dq_entity.DqEntity.from_dataplex_entity(
            entity_id=entity_uri.get_db_primary_key(),
            dataplex_entity=dataplex_entity,
        )
        entity_uri_primary_key = entity_uri.get_db_primary_key().upper()
        gcs_entity_external_table_name = clouddq_entity.get_table_name()
        logger.debug(
            f"GCS Entity External Table Name is {gcs_entity_external_table_name}"
        )
        bq_table_exists = bigquery_client.is_table_exists(
            table=gcs_entity_external_table_name,
            project_id=clouddq_entity.instance_name,
        )
        if bq_table_exists:
            logger.debug(
                f"The External Table {gcs_entity_external_table_name} for Entity URI "
                f"{entity_uri_primary_key} exists in BigQuery.")
        else:
            raise RuntimeError(
                f"Unable to find BigQuery External Table {gcs_entity_external_table_name} "
                f"for Entity URI {entity_uri_primary_key}")
        return clouddq_entity
Example 8
    def _resolve_bigquery_entity_uri(
        self,
        entity_uri: dq_entity_uri.EntityUri,
        dataplex_client: clouddq_dataplex.CloudDqDataplexClient,
        bigquery_client: BigQueryClient,
    ) -> dq_entity.DqEntity:
        required_arguments = ["projects", "datasets", "tables"]
        for argument in required_arguments:
            uri_argument = entity_uri.get_configs(argument)
            if not uri_argument:
                raise RuntimeError(
                    f"Failed to retrieve default BigQuery '{argument}' for "
                    f"entity_uri: {entity_uri.complete_uri_string}.\n"
                    f"'{argument}' is a required argument to look up metadata for the entity_uri "
                    "using the BigQuery API.\n")

        project_id = entity_uri.get_configs("projects")
        table_name = entity_uri.get_table_name()
        bq_table_exists = bigquery_client.is_table_exists(
            table=table_name, project_id=project_id)
        if bq_table_exists:
            logger.debug(f"The Table '{table_name}' in the "
                         f"specified entity_uri '{entity_uri}' "
                         f"exists in Bigquery.")
            dataplex_entity = self.is_dataplex_entity(
                entity_uri=entity_uri, dataplex_client=dataplex_client)
            if dataplex_entity:
                clouddq_entity = dataplex_entity
            else:
                clouddq_entity = dq_entity.DqEntity.from_bq_entity_uri(
                    entity_uri=entity_uri,
                    bigquery_client=bigquery_client,
                )
            return clouddq_entity
        else:
            raise RuntimeError(
                f"BigQuery table '{table_name}' specified in the "
                f"entity_uri '{entity_uri}' does not exist.")
Example 9
    def is_dataplex_entity(
        self,
        entity_uri: dq_entity_uri.EntityUri,
        dataplex_client: clouddq_dataplex.CloudDqDataplexClient,
    ):
        """Return the DqEntity when the URI resolves to exactly one Dataplex entity, otherwise False."""
        required_arguments = ["projects", "lakes", "locations", "zones"]
        for argument in required_arguments:
            uri_argument = entity_uri.get_configs(argument)
            if not uri_argument:
                logger.info(
                    f"Failed to retrieve default Dataplex '{argument}' for "
                    f"entity_uri: {entity_uri.complete_uri_string}.\n"
                    f"'{argument}' is a required argument to look up metadata for the entity_uri "
                    "using the Dataplex Metadata API.\n"
                    "Ensure the BigQuery dataset containing this table "
                    "is attached as an asset in Dataplex.\n"
                    "You can then specify the corresponding Dataplex "
                    "projects/locations/lakes/zones as part of the "
                    "metadata_default_registries YAML configs, e.g.\n"
                    f"{SAMPLE_DEFAULT_REGISTRIES_YAML}")
                return False
        dataplex_entities_match = dataplex_client.list_dataplex_entities(
            gcp_project_id=entity_uri.get_configs("projects"),
            location_id=entity_uri.get_configs("locations"),
            lake_name=entity_uri.get_configs("lakes"),
            zone_id=entity_uri.get_configs("zones"),
            data_path=entity_uri.get_entity_id(),
        )
        logger.info(
            f"Retrieved Dataplex Entities:\n{pformat(dataplex_entities_match)}"
        )
        if len(dataplex_entities_match) != 1:
            logger.info("Failed to retrieve a single Dataplex Metadata entry for "
                        f"entity_uri '{entity_uri.complete_uri_string}'.\n"
                        f"Matches:\n"
                        f"{pformat(json.dumps(dataplex_entities_match))}\n\n"
                        f"Parsed entity_uri configs:\n"
                        f"{pformat(entity_uri.to_dict())}\n\n")
            return False
        else:
            dataplex_entity = dataplex_entities_match[0]
            clouddq_entity = dq_entity.DqEntity.from_dataplex_entity(
                entity_id=entity_uri.get_db_primary_key(),
                dataplex_entity=dataplex_entity,
            )
        return clouddq_entity
    def test_dq_entity_parse_bigquery_uri(
        self,
        gcp_project_id,
        test_dataplex_metadata_defaults_configs,
        test_bigquery_client,
    ):

        bq_entity_uri_string = f"bigquery://projects/{gcp_project_id}/datasets/" \
                               f"austin_311/tables/contact_details_partitioned"
        bq_entity_uri = EntityUri.from_uri(
            uri_string=bq_entity_uri_string,
            default_configs=test_dataplex_metadata_defaults_configs,
        )
        clouddq_entity = DqEntity.from_bq_entity_uri(
            entity_uri=bq_entity_uri,
            bigquery_client=test_bigquery_client)
        clouddq_entity_expected_dict = {bq_entity_uri_string.upper().split("://")[1]: {
            'source_database': 'BIGQUERY',
            'table_name': 'contact_details_partitioned',
            'database_name': 'austin_311',
            'instance_name': f'{gcp_project_id}',
            'columns': {
                'ROW_ID': {'name': 'row_id', 'data_type': 'STRING'},
                'CONTACT_TYPE': {'name': 'contact_type', 'data_type': 'STRING'},
                'VALUE': {'name': 'value', 'data_type': 'STRING'},
                'TS': {'name': 'ts', 'data_type': 'TIMESTAMP'}
            },
            'resource_type': 'BIGQUERY',
            'partition_fields': [{
                'name': 'ts',
                'type': 'TIMESTAMP',
                'partitioning_type': 'DAY'
            }],
            'dataset_name': 'austin_311',
            'project_name': f'{gcp_project_id}'
        }}

        assert clouddq_entity.to_dict() == clouddq_entity_expected_dict
    def test_entity_uri_parse_failure(self, entity_uri, error_type):
        """Malformed entity URIs raise the expected error type."""
        with pytest.raises(error_type):
            EntityUri.from_uri(entity_uri)
    def from_dict(
        cls: DqRuleBinding,
        rule_binding_id: str,
        kwargs: dict,
        default_configs: dict | None = None,
        validate_uri: bool = True,
    ) -> DqRuleBinding:
        """

        Args:
          cls: DqRuleBinding:
          rule_binding_id: typing.Union[str, str]:
          kwargs: typing.Dict:

        Returns:

        """
        entity_config: dict = get_keys_from_dict_and_assert_oneof(
            config_id=rule_binding_id,
            kwargs=kwargs,
            keys=["entity_uri", "entity_id"],
        )
        if "entity_id" in entity_config:
            entity_id = entity_config["entity_id"]
            entity_uri = None
        if "entity_uri" in entity_config:
            parsed_entity_uri = EntityUri.from_uri(
                entity_config["entity_uri"],
                default_configs=default_configs,
                validate_uri=validate_uri,
            )
            entity_id = parsed_entity_uri.get_entity_id()
            entity_uri = parsed_entity_uri
        if entity_id:
            entity_id = entity_id.upper()
        column_id: str = get_from_dict_and_assert(
            config_id=rule_binding_id,
            kwargs=kwargs,
            key="column_id",
        )
        if column_id:
            column_id = column_id.upper()
        row_filter_id: str = get_from_dict_and_assert(
            config_id=rule_binding_id,
            kwargs=kwargs,
            key="row_filter_id",
        )
        if row_filter_id:
            row_filter_id = row_filter_id.upper()
        rule_ids: list[str] = get_from_dict_and_assert(
            config_id=rule_binding_id,
            kwargs=kwargs,
            key="rule_ids",
            assertion=lambda x: type(x) == list,
            error_msg=
            f"Rule Binding ID: '{rule_binding_id}' must have defined value "
            f"'rule_ids' of type 'list'.",
        )
        incremental_time_filter_column_id: str | None = kwargs.get(
            "incremental_time_filter_column_id", None)
        if incremental_time_filter_column_id:
            incremental_time_filter_column_id = incremental_time_filter_column_id.upper()
        metadata: dict | None = kwargs.get("metadata", dict())
        if type(metadata) != dict:
            raise ValueError(
                f"Rule Binding ID: '{rule_binding_id}' has invalid "
                f"metadata field with type {type(metadata)} and values: {metadata}\n"
                "'metadata' must be of type dictionary.")
        return DqRuleBinding(
            rule_binding_id=str(rule_binding_id).upper(),
            entity_id=entity_id,
            entity_uri=entity_uri,
            column_id=column_id,
            row_filter_id=row_filter_id,
            incremental_time_filter_column_id=incremental_time_filter_column_id,
            rule_ids=rule_ids,
            metadata=metadata,
        )
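
A minimal sketch of calling from_dict with a hand-written rule binding config. The IDs, rule names, and metadata values below are illustrative only, and from_dict is assumed to be a classmethod, consistent with how the other from_* constructors are invoked elsewhere in this listing:

# Illustrative config only; none of these IDs come from the project.
rule_binding = DqRuleBinding.from_dict(
    rule_binding_id="t1_dq_check",
    kwargs={
        "entity_uri": "bigquery://projects/project-id/datasets/dataset-id/tables/table-id",
        "column_id": "value",
        "row_filter_id": "none",
        "rule_ids": ["not_null_simple"],
        "metadata": {"team": "data-quality"},
    },
    default_configs=None,
)
print(rule_binding.rule_binding_id)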