def _resolve_dataplex_entity_uri(
    self,
    entity_uri: dq_entity_uri.EntityUri,
    dataplex_client: clouddq_dataplex.CloudDqDataplexClient,
    bigquery_client: BigQueryClient,
) -> dq_entity.DqEntity:
    """Resolve a Dataplex entity URI into a CloudDQ entity.

    Fetches the entity through the Dataplex Metadata API, converts it into
    a ``DqEntity``, then confirms that the entity's backing external table
    is visible in BigQuery before handing the entity back.

    Args:
        entity_uri: parsed Dataplex entity URI to resolve.
        dataplex_client: client used for the Dataplex Metadata look-up.
        bigquery_client: client used to verify the external table exists.

    Returns:
        The resolved ``DqEntity``.

    Raises:
        RuntimeError: if the external BigQuery table cannot be found.
    """
    dataplex_entity = dataplex_client.get_dataplex_entity(
        gcp_project_id=entity_uri.get_configs("projects"),
        location_id=entity_uri.get_configs("locations"),
        lake_name=entity_uri.get_configs("lakes"),
        zone_id=entity_uri.get_configs("zones"),
        entity_id=entity_uri.get_entity_id(),
    )
    clouddq_entity = dq_entity.DqEntity.from_dataplex_entity(
        entity_id=entity_uri.get_db_primary_key(),
        dataplex_entity=dataplex_entity,
    )
    entity_uri_primary_key = entity_uri.get_db_primary_key().upper()
    gcs_entity_external_table_name = clouddq_entity.get_table_name()
    logger.debug(
        f"GCS Entity External Table Name is {gcs_entity_external_table_name}"
    )
    # Guard: fail fast when the external table backing this entity is missing.
    if not bigquery_client.is_table_exists(
        table=gcs_entity_external_table_name,
        project_id=clouddq_entity.instance_name,
    ):
        raise RuntimeError(
            f"Unable to find Bigquery External Table {gcs_entity_external_table_name} "
            f"for Entity URI {entity_uri_primary_key}")
    logger.debug(
        f"The External Table {gcs_entity_external_table_name} for Entity URI "
        f"{entity_uri_primary_key} exists in Bigquery.")
    return clouddq_entity
def from_bq_entity_uri(self, entity_uri: EntityUri, bigquery_client: BigQueryClient) -> DqEntity:
    """Build a ``DqEntity`` directly from a BigQuery entity URI.

    Reads the table schema from BigQuery and assembles the entity-config
    mapping expected by ``DqEntity.from_dict``.  All Dataplex-specific
    metadata fields are set to ``None`` since this entity is not backed by
    a Dataplex registration.

    Args:
        entity_uri: parsed BigQuery entity URI.
        bigquery_client: client used to fetch the table schema.

    Returns:
        The constructed ``DqEntity``.
    """
    configs = entity_uri.configs_dict
    entity_id = entity_uri.get_entity_id()
    # Schema look-up provides both column metadata and partition fields.
    schema = bigquery_client.get_table_schema(
        table=entity_uri.get_table_name(),
        project_id=entity_uri.get_configs("projects"),
    )
    entity_configs = {
        "source_database": "BIGQUERY",
        "resource_type": "BIGQUERY",
        "table_name": configs.get("tables"),
        "dataset_name": configs.get("datasets"),
        "project_name": configs.get("projects"),
        "columns": schema.get("columns"),
        "environment_override": {},
        "entity_id": entity_id,
        "dataplex_name": None,
        "dataplex_lake": None,
        "dataplex_zone": None,
        "dataplex_location": None,
        "dataplex_asset_id": None,
        "dataplex_createTime": None,
        "dataplex_updateTime": None,
        "partition_fields": schema.get("partition_fields"),
    }
    return DqEntity.from_dict(entity_id=entity_id.upper(), kwargs=entity_configs)
def is_dataplex_entity(
    self,
    entity_uri: dq_entity_uri.EntityUri,
    dataplex_client: clouddq_dataplex.CloudDqDataplexClient,
):
    """Attempt to resolve *entity_uri* through the Dataplex Metadata API.

    Despite the ``is_`` prefix, the success value is the resolved
    ``DqEntity`` itself (truthy), not ``True`` — callers rely on its
    truthiness.  Returns ``False`` when any required URI argument is
    missing, or when the look-up does not yield exactly one entity.

    Args:
        entity_uri: parsed entity URI to look up.
        dataplex_client: client used for the Dataplex Metadata look-up.

    Returns:
        The resolved ``DqEntity`` on success, otherwise ``False``.
    """
    required_arguments = ["projects", "lakes", "locations", "zones"]
    for argument in required_arguments:
        uri_argument = entity_uri.get_configs(argument)
        if not uri_argument:
            logger.info(
                f"Failed to retrieve default Dataplex '{argument}' for "
                f"entity_uri: {entity_uri.complete_uri_string}. \n"
                f"'{argument}' is a required argument to look-up metadata for the entity_uri "
                "using Dataplex Metadata API.\n"
                "Ensure the BigQuery dataset containing this table "
                "is attached as an asset in Dataplex.\n"
                "You can then specify the corresponding Dataplex "
                "projects/locations/lakes/zones as part of the "
                "metadata_default_registries YAML configs, e.g.\n"
                f"{SAMPLE_DEFAULT_REGISTRIES_YAML}")
            return False
    dataplex_entities_match = dataplex_client.list_dataplex_entities(
        gcp_project_id=entity_uri.get_configs("projects"),
        location_id=entity_uri.get_configs("locations"),
        lake_name=entity_uri.get_configs("lakes"),
        zone_id=entity_uri.get_configs("zones"),
        data_path=entity_uri.get_entity_id(),
    )
    logger.info(
        f"Retrieved Dataplex Entities:\n{pformat(dataplex_entities_match)}"
    )
    if len(dataplex_entities_match) != 1:
        # FIX: previously logged pformat(json.dumps(dataplex_entities_match)),
        # which raises TypeError when the matched entities are not
        # JSON-serializable objects — crashing the error-reporting path itself.
        # Use pformat directly, consistent with the log statement above.
        logger.info("Failed to retrieve Dataplex Metadata entry for "
                    f"entity_uri '{entity_uri.complete_uri_string}' "
                    f"with error:\n"
                    f"{pformat(dataplex_entities_match)}\n\n"
                    f"Parsed entity_uri configs:\n"
                    f"{pformat(entity_uri.to_dict())}\n\n")
        return False
    dataplex_entity = dataplex_entities_match[0]
    clouddq_entity = dq_entity.DqEntity.from_dataplex_entity(
        entity_id=entity_uri.get_db_primary_key(),
        dataplex_entity=dataplex_entity,
    )
    return clouddq_entity
def _resolve_bigquery_entity_uri(
    self,
    entity_uri: dq_entity_uri.EntityUri,
    dataplex_client: clouddq_dataplex.CloudDqDataplexClient,
    bigquery_client: BigQueryClient,
) -> dq_entity.DqEntity:
    """Resolve a BigQuery entity URI into a CloudDQ entity.

    Validates the required URI arguments, checks the table exists in
    BigQuery, then prefers a Dataplex-registered entity when one can be
    resolved, falling back to building the entity straight from the
    BigQuery schema.

    Args:
        entity_uri: parsed BigQuery entity URI to resolve.
        dataplex_client: client used to probe for a Dataplex registration.
        bigquery_client: client used to verify and read the table.

    Returns:
        The resolved ``DqEntity``.

    Raises:
        RuntimeError: if a required URI argument is missing, or the table
            does not exist in BigQuery.
    """
    # Guard: every one of these must be present to query the BigQuery API.
    for argument in ("projects", "datasets", "tables"):
        if not entity_uri.get_configs(argument):
            raise RuntimeError(
                f"Failed to retrieve default Bigquery '{argument}' for "
                f"entity_uri: {entity_uri.complete_uri_string}. \n"
                f"'{argument}' is a required argument to look-up metadata for the entity_uri "
                "using Bigquery API.\n")
    project_id = entity_uri.get_configs("projects")
    table_name = entity_uri.get_table_name()
    # Guard: the referenced table must actually exist before resolving.
    if not bigquery_client.is_table_exists(table=table_name, project_id=project_id):
        raise RuntimeError(
            f"Bigquery Table '{table_name}' specified in the "
            f"entity uri '{entity_uri}' does not exist")
    logger.debug(f"The Table '{table_name}' in the "
                 f"specified entity_uri '{entity_uri}' "
                 f"exists in Bigquery.")
    # Prefer the Dataplex registration when one resolves (truthy result).
    dataplex_entity = self.is_dataplex_entity(
        entity_uri=entity_uri, dataplex_client=dataplex_client)
    if dataplex_entity:
        return dataplex_entity
    return dq_entity.DqEntity.from_bq_entity_uri(
        entity_uri=entity_uri,
        bigquery_client=bigquery_client,
    )