def ingest(config: str):
    """Main command for ingesting metadata into DataHub"""
    # Resolve the recipe path and fail fast if it does not exist.
    recipe_path = pathlib.Path(config)
    if not recipe_path.is_file():
        raise ConfigurationError(f"Cannot open config file {config}")

    # Choose a parser based on the file extension.
    config_mech: ConfigurationMechanism
    suffix = recipe_path.suffix
    if suffix in (".yaml", ".yml"):
        config_mech = YamlConfigurationMechanism()
    elif suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".format(
                suffix
            )
        )

    with recipe_path.open() as fp:
        pipeline_config = config_mech.load_config(fp)

    # Build and run the pipeline; validation errors get pretty rendering.
    with nicely_formatted_validation_errors():
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
def host_colon_port_comma(cls, host_val: str) -> str:
    """Validate a comma-separated list of host[:port] entries.

    Each entry may carry an http(s) scheme prefix and a trailing slash, both of
    which are ignored for validation. Returns the input value unchanged.

    Raises:
        ConfigurationError: if a host has bad characters or a port is non-numeric.
    """
    for raw_entry in host_val.split(","):
        # The port can be provided but is not required.
        port = None
        entry = raw_entry
        # Strip an optional scheme prefix and trailing slash before validating.
        for prefix in ("http://", "https://"):
            if entry.startswith(prefix):
                entry = entry[len(prefix):]
        for suffix in ("/",):
            if entry.endswith(suffix):
                entry = entry[:-len(suffix)]
        head, sep, tail = entry.rpartition(":")
        if sep:
            host, port = head, tail
        else:
            host = entry
        # This regex is quite loose. Many invalid hostnames or IPs will slip through,
        # but it serves as a good first line of validation. We defer to Elastic for the
        # remaining validation.
        if not re.match(r"^[\w\-\.]+$", host):
            raise ConfigurationError(f"host contains bad characters, found {host}")
        if port is not None and not port.isdigit():
            raise ConfigurationError(f"port must be all digits, found {port}")
    return host_val
def ingest(config: str):
    """Main command for ingesting metadata into DataHub"""
    config_file = pathlib.Path(config)
    if not config_file.is_file():
        raise ConfigurationError(f"Cannot open config file {config}")

    # Pick the configuration parser by extension.
    config_mech: ConfigurationMechanism
    if config_file.suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    elif config_file.suffix in (".yaml", ".yml"):
        config_mech = YamlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".format(
                config_file.suffix
            )
        )

    with config_file.open() as fp:
        pipeline_config = config_mech.load_config(fp)

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)
    except ValidationError as e:
        # Surface pydantic validation problems as a clean CLI error.
        click.echo(e, err=True)
        sys.exit(1)

    pipeline.run()
    ret = pipeline.pretty_print_summary()
    sys.exit(ret)
def validate(self, at_least_one: bool) -> bool:
    """Check every {var} placeholder in self.pattern against self.allowed_vars.

    Side effect: stores the bare variable names (braces stripped) on
    self.variables.

    Raises:
        ConfigurationError: on an unknown variable, or when at_least_one is set
            and the pattern contains no variables at all.
    """
    found = re.findall("({[^}{]+})", self.pattern)
    self.variables = [item[1:-1] for item in found]
    for item in found:
        if item[1:-1] not in self.allowed_vars:
            raise ConfigurationError(
                f"Failed to find {item} in allowed_variables {self.allowed_vars}"
            )
    if at_least_one and not found:
        raise ConfigurationError(
            f"Failed to find any variable assigned to pattern {self.pattern}. Must have at least one. Allowed variables are {self.allowed_vars}"
        )
    return True
def web_service_url_scheme_host_port(cls, val: str) -> str:
    """Validate that val is an http(s) URL with a sane hostname.

    Returns the value with trailing slashes removed.

    Raises:
        ConfigurationError: on a non-http(s) scheme or an invalid hostname.
    """
    # Tokenize the web url
    parsed = urlparse(val)
    if parsed.scheme not in ("http", "https"):
        raise ConfigurationError(
            f"Scheme should be http or https, found {parsed.scheme}"
        )
    if not _is_valid_hostname(str(parsed.hostname)):
        raise ConfigurationError(
            f"Not a valid hostname, hostname contains invalid characters, found {parsed.hostname}"
        )
    return config_clean.remove_trailing_slashes(val)
def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
    """Set up the Kafka consumer, schema registry client, and source report.

    Raises:
        ConfigurationError: if stateful ingestion is enabled without a
            platform_instance to key the state on.
    """
    super().__init__(config, ctx)
    self.source_config = config

    if (
        self.is_stateful_ingestion_configured()
        and not self.source_config.platform_instance
    ):
        raise ConfigurationError(
            "Enabling kafka stateful ingestion requires to specify a platform instance."
        )

    # NOTE(review): the consumer group id is hard-coded to "test" — confirm
    # this is intentional for metadata-only consumption.
    consumer_conf = {
        "group.id": "test",
        "bootstrap.servers": self.source_config.connection.bootstrap,
        **self.source_config.connection.consumer_config,
    }
    self.consumer = confluent_kafka.Consumer(consumer_conf)

    # Use the fully qualified name for SchemaRegistryClient to make it mock patchable for testing.
    registry_conf = {
        "url": self.source_config.connection.schema_registry_url,
        **self.source_config.connection.schema_registry_config,
    }
    self.schema_registry_client = (
        confluent_kafka.schema_registry.schema_registry_client.SchemaRegistryClient(
            registry_conf
        )
    )

    self.report = KafkaSourceReport()
    self.known_schema_registry_subjects: List[str] = []
    try:
        self.known_schema_registry_subjects.extend(
            self.schema_registry_client.get_subjects()
        )
    except Exception as e:
        # Best-effort: an unreachable registry should not abort construction.
        logger.warning(f"Failed to get subjects from schema registry: {e}")
def __init__(self, config: AddDatasetOwnershipConfig, ctx: PipelineContext):
    """Store config/context; PATCH semantics additionally requires a graph connection."""
    self.ctx = ctx
    self.config = config
    patch_requested = self.config.semantics == Semantics.PATCH
    if patch_requested and self.ctx.graph is None:
        raise ConfigurationError(
            "With PATCH semantics, AddDatasetOwnership requires a datahub_api to connect to. Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe"
        )
def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> IngestionStateProvider:
    """Build a state provider, preferring the pipeline-level graph over local config.

    Raises:
        ConfigurationError: if no configuration or no datahub_api is available.
    """
    if ctx.graph:
        return cls(ctx.graph)
    if config_dict is None:
        raise ConfigurationError("Missing provider configuration")
    provider_config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
    if not provider_config.datahub_api:
        raise ConfigurationError(
            "Missing datahub_api. Provide either a global one or under the state_provider."
        )
    return cls(DataHubGraph(provider_config.datahub_api))
def get_access_token(self):
    """Return a cached PowerBi bearer token, acquiring a fresh one via MSAL if needed.

    Raises:
        ConfigurationError: if MSAL does not return an access token.
    """
    # Return the cached token when one was already acquired.
    if self.__access_token != "":
        LOGGER.info("Returning the cached access token")
        return self.__access_token

    LOGGER.info("Generating PowerBi access token")
    auth_response = self.__msal_client.acquire_token_for_client(
        scopes=[self.__config.scope]
    )
    if not auth_response.get("access_token"):
        # FIX: Logger.warn is a deprecated alias; use Logger.warning instead.
        LOGGER.warning(
            "Failed to generate the PowerBi access token. Please check input configuration"
        )
        raise ConfigurationError(
            "Powerbi authorization failed . Please check your input configuration."
        )

    LOGGER.info("Generated PowerBi access token")
    # Cache the token in the "Bearer <token>" form used by the API headers.
    self.__access_token = "Bearer {}".format(auth_response.get("access_token"))
    LOGGER.debug("{}={}".format(Constant.PBIAccessToken, self.__access_token))
    return self.__access_token
def check_either_project_name_or_api_provided(cls, values):
    """Validate that we must either have a project name or an api credential to fetch project names"""
    if not values.get("project_name") and not values.get("api"):
        # FIX: corrected "not" -> "nor" in the user-facing error message.
        raise ConfigurationError(
            "Neither project_name nor an API credential was found. LookML source requires either api credentials for Looker or a project_name to accurately name views and models."
        )
    return values
def type_must_be_supported(cls, v: str) -> str:
    """Reject any entity type outside the currently supported set."""
    allowed_types = ["dataset"]
    if v in allowed_types:
        return v
    raise ConfigurationError(
        f"Type must be one of {allowed_types}, {v} is not yet supported."
    )
def _authenticate(self):
    """Sign in to the Tableau server.

    Prefers username/password auth, falling back to a personal access token.
    Login failures are recorded on the source report rather than raised.

    Raises:
        ConfigurationError: if neither credential pair is configured.
    """
    # https://tableau.github.io/server-client-python/docs/api-ref#authentication
    authentication = None
    if self.config.username and self.config.password:
        authentication = TableauAuth(
            username=self.config.username,
            password=self.config.password,
            site_id=self.config.site,
        )
    elif self.config.token_name and self.config.token_value:
        authentication = PersonalAccessTokenAuth(
            self.config.token_name, self.config.token_value, self.config.site
        )
    else:
        raise ConfigurationError(
            "Tableau Source: Either username/password or token_name/token_value must be set"
        )

    try:
        self.server = Server(self.config.connect_uri, use_server_version=True)
        self.server.auth.sign_in(authentication)
    except ServerResponseError as e:
        logger.error(e)
        self.report.report_failure(
            key="tableau-login",
            # FIX: the adjacent f-strings previously concatenated with no
            # separator, yielding "...providedReason: ..." in the report.
            reason=f"Unable to Login with credentials provided. "
            f"Reason: {str(e)}",
        )
    except Exception as e:
        logger.error(e)
        self.report.report_failure(
            key="tableau-login",
            reason=f"Unable to Login. "
            f"Reason: {str(e)}",
        )
def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
    """Require a state_provider whenever stateful ingestion is enabled."""
    stateful_enabled = values.get("enabled")
    if stateful_enabled and values.get("state_provider") is None:
        raise ConfigurationError(
            "Must specify state_provider configuration if stateful ingestion is enabled."
        )
    return values
def _initialize_state_provider(self) -> None:
    """Instantiate the configured ingestion state provider.

    No-op (leaves self.ingestion_state_provider as None) unless stateful
    ingestion is enabled and a state_provider is configured.

    Raises:
        ConfigurationError: if stateful ingestion is enabled without a
            pipeline_name to key the state on.
    """
    self.ingestion_state_provider: Optional[IngestionStateProvider] = None
    cfg = self.stateful_ingestion_config
    # Guard clause: nothing to do unless stateful ingestion is fully configured.
    if cfg is None or cfg.state_provider is None or not cfg.enabled:
        return
    if self.ctx.pipeline_name is None:
        raise ConfigurationError(
            "pipeline_name must be provided if stateful ingestion is enabled."
        )
    state_provider_class = ingestion_state_provider_registry.get(
        cfg.state_provider.type
    )
    self.ingestion_state_provider = state_provider_class.create(
        cfg.state_provider.dict().get("config", {}),
        self.ctx,
    )
    if cfg.ignore_old_state:
        logger.warning(
            "The 'ignore_old_state' config is True. The old checkpoint state will not be provided."
        )
    if cfg.ignore_new_state:
        logger.warning(
            "The 'ignore_new_state' config is True. The new checkpoint state will not be created."
        )
    logger.debug(
        f"Successfully created {cfg.state_provider.type} state provider."
    )
def __init__(self, graph: DataHubGraph):
    """Wrap a DataHubGraph, refusing servers without stateful ingestion support."""
    self.graph = graph
    server_capable = self._is_server_stateful_ingestion_capable()
    if not server_capable:
        raise ConfigurationError(
            "Datahub server is not capable of supporting stateful ingestion."
            " Please consider upgrading to the latest server version to use this feature."
        )
def platform_validator(cls, v: str) -> str:
    """Accept an empty/unset platform, or one of the VALID_PLATFORMS entries."""
    if v and v not in VALID_PLATFORMS:
        raise ConfigurationError(
            f"'platform' can only take following values: {VALID_PLATFORMS}"
        )
    return v
def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
    """Create the REST sink: build the emitter, verify GMS connectivity, and
    set up the async emission thread pool.

    Raises:
        ConfigurationError: if the GMS endpoint cannot be reached.
    """
    super().__init__(ctx)
    self.config = config
    self.report = DataHubRestSinkReport()
    self.emitter = DatahubRestEmitter(
        self.config.server,
        self.config.token,
        connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
        read_timeout_sec=self.config.timeout_sec,
        retry_status_codes=self.config.retry_status_codes,
        retry_max_times=self.config.retry_max_times,
        extra_headers=self.config.extra_headers,
        ca_certificate_path=self.config.ca_certificate_path,
    )
    try:
        # Fail fast at construction time if the server is unreachable.
        gms_config = self.emitter.test_connection()
    except Exception as exc:
        raise ConfigurationError(
            f"💥 Failed to connect to DataHub@{self.config.server} (token:{'XXX-redacted' if self.config.token else 'empty'}) over REST",
            exc,
        )
    # Record the server version reported by GMS on the sink report.
    self.report.gms_version = (
        gms_config.get("versions", {})
        .get("linkedin/datahub", {})
        .get("version", "")
    )
    logger.debug("Setting env variables to override config")
    set_env_variables_override_config(self.config.server, self.config.token)
    logger.debug("Setting gms config")
    set_gms_config(gms_config)
    # Thread pool used for asynchronous emission of work units.
    self.executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=self.config.max_threads
    )
def _initialize_checkpointing_state_provider(self) -> None:
    """Create and register the checkpointing state provider.

    No-op unless stateful ingestion is enabled and a state_provider is
    configured. On success the provider is registered on the pipeline context.

    Raises:
        ConfigurationError: if pipeline_name is missing, or the configured
            provider type is not found in the registry.
    """
    self.ingestion_checkpointing_state_provider: Optional[
        IngestionCheckpointingProviderBase
    ] = None
    if (
        self.stateful_ingestion_config is not None
        and self.stateful_ingestion_config.state_provider is not None
        and self.stateful_ingestion_config.enabled
    ):
        if self.ctx.pipeline_name is None:
            # The pipeline name keys the checkpoint state, so it is mandatory.
            raise ConfigurationError(
                "pipeline_name must be provided if stateful ingestion is enabled."
            )
        checkpointing_state_provider_class = (
            ingestion_checkpoint_provider_registry.get(
                self.stateful_ingestion_config.state_provider.type
            )
        )
        if checkpointing_state_provider_class is None:
            raise ConfigurationError(
                f"Cannot find checkpoint provider class of type={self.stateful_ingestion_config.state_provider.type} "
                " in the registry! Please check the type of the checkpointing provider in your config."
            )
        config_dict: Dict[str, Any] = cast(
            Dict[str, Any],
            self.stateful_ingestion_config.state_provider.dict().get("config", {}),
        )
        self.ingestion_checkpointing_state_provider = checkpointing_state_provider_class.create(  # type: ignore
            config_dict=config_dict,
            ctx=self.ctx,
            name=checkpointing_state_provider_class.__name__,
        )
        assert self.ingestion_checkpointing_state_provider
        if self.stateful_ingestion_config.ignore_old_state:
            logger.warning(
                "The 'ignore_old_state' config is True. The old checkpoint state will not be provided."
            )
        if self.stateful_ingestion_config.ignore_new_state:
            logger.warning(
                "The 'ignore_new_state' config is True. The new checkpoint state will not be created."
            )
        # Add the checkpoint state provide to the platform context.
        self.ctx.register_checkpointer(self.ingestion_checkpointing_state_provider)
        logger.debug(
            f"Successfully created {self.stateful_ingestion_config.state_provider.type} state provider."
        )
def check_either_connection_map_or_connection_provided(cls, values):
    """Validate that we must either have a connection map or an api credential"""
    if not values.get("connection_to_platform_map", {}) and not values.get(
        "api", {}
    ):
        # FIX: corrected "not" -> "nor" in the user-facing error message.
        raise ConfigurationError(
            "Neither api nor connection_to_platform_map config was found. LookML source requires either api credentials for Looker or a map of connection names to platform identifiers to work correctly"
        )
    return values
def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext, name: str) -> IngestionCheckpointingProviderBase:
    """Build a checkpointing provider, preferring the pipeline-level graph.

    Raises:
        ConfigurationError: if no configuration or no datahub_api is available.
    """
    # Use the pipeline-level graph if set
    if ctx.graph:
        return cls(ctx.graph, name)
    if config_dict is None:
        raise ConfigurationError("Missing provider configuration.")
    provider_config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
    if not provider_config.datahub_api:
        raise ConfigurationError(
            "Missing datahub_api. Provide either a global one or under the state_provider."
        )
    return cls(DataHubGraph(provider_config.datahub_api), name)
def ensure_only_issuer_or_token(
    cls, token: Optional[str], values: Dict[str, Optional[str]]
) -> Optional[str]:
    """Reject configs that set both token and issuer_url authentication."""
    both_configured = token is not None and values.get("issuer_url") is not None
    if both_configured:
        raise ConfigurationError(
            "Expected only one authentication method, either issuer_url or token."
        )
    return token
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Load the dbt manifest/catalog files and yield metadata work units.

    Emits dbt-platform entities first (unless disabled), then target-platform
    entities, and finally stale-entity removals when stateful ingestion is on.

    Raises:
        ConfigurationError: if PATCH write semantics is used without a graph.
    """
    if self.config.write_semantics == "PATCH" and not self.ctx.graph:
        raise ConfigurationError(
            "With PATCH semantics, dbt source requires a datahub_api to connect to. "
            "Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe."
        )
    (
        nodes,
        manifest_schema,
        manifest_version,
        catalog_schema,
        catalog_version,
        manifest_nodes_raw,
    ) = loadManifestAndCatalog(
        self.config.manifest_path,
        self.config.catalog_path,
        self.config.sources_path,
        self.config.load_schemas,
        self.config.use_identifiers,
        self.config.tag_prefix,
        self.config.node_type_pattern,
        self.report,
        self.config.node_name_pattern,
    )
    # Schema/version info from the dbt artifacts, attached as custom properties.
    additional_custom_props = {
        "manifest_schema": manifest_schema,
        "manifest_version": manifest_version,
        "catalog_schema": catalog_schema,
        "catalog_version": catalog_version,
    }
    # Drop unset values so we do not emit null-valued properties.
    additional_custom_props_filtered = {
        key: value
        for key, value in additional_custom_props.items()
        if value is not None
    }
    if not self.config.disable_dbt_node_creation:
        yield from self.create_platform_mces(
            nodes,
            additional_custom_props_filtered,
            manifest_nodes_raw,
            DBT_PLATFORM,
        )
    yield from self.create_platform_mces(
        nodes,
        additional_custom_props_filtered,
        manifest_nodes_raw,
        self.config.target_platform,
    )
    if self.is_stateful_ingestion_configured():
        # Clean up stale entities.
        yield from self.gen_removed_entity_workunits()
def validate_that_bigquery_audit_metadata_datasets_is_correctly_configured(
    cls, values: Dict[str, Any]
) -> Dict[str, Any]:
    """Require audit-metadata dataset names when exported audit metadata is used
    without v2 audit metadata."""
    if (
        values.get("use_exported_bigquery_audit_metadata")
        and not values.get("use_v2_audit_metadata")
        and not values.get("bigquery_audit_metadata_datasets")
    ):
        raise ConfigurationError(
            "bigquery_audit_metadata_datasets must be specified if using exported audit metadata. Otherwise set use_v2_audit_metadata to True."
        )
    # FIX: removed a stray no-op `pass` statement before the return.
    return values
def env_must_be_one_of(cls, v: str) -> str:
    """Normalize env to uppercase and check it against FabricTypeClass constants."""
    # Get all the constants from the FabricTypeClass. It's not an enum, so this is a bit hacky but works
    allowed_envs = [
        value
        for name, value in vars(FabricTypeClass).items()
        if not name.startswith("_")
    ]
    normalized = v.upper()
    if normalized not in allowed_envs:
        raise ConfigurationError(f"env must be one of {allowed_envs}, found {v}")
    return normalized
def ensure_client_id_and_secret_for_issuer_url(
    cls, client_secret: Optional[str], values: Dict[str, Optional[str]]
) -> Optional[str]:
    """When issuer_url auth is used, both client_id and client_secret are required."""
    issuer_configured = values.get("issuer_url") is not None
    credentials_missing = client_secret is None or values.get("client_id") is None
    if issuer_configured and credentials_missing:
        raise ConfigurationError(
            "Missing configuration: client_id and client_secret are mandatory when issuer_url is set."
        )
    return client_secret
def __init__(self, config: SQLAlchemyConfig, ctx: PipelineContext, platform: str):
    """Common SQL-source setup; rejects profiling requests when the profiler
    plugin is not installed."""
    super().__init__(ctx)
    self.config = config
    self.platform = platform
    self.report = SQLSourceReport()
    profiling_requested = self.config.profiling.enabled
    if profiling_requested and not self._can_run_profiler():
        raise ConfigurationError(
            "Table profiles requested but profiler plugin is not enabled. "
            f"Try running: pip install '{__package_name__}[sql-profiles]'"
        )
def get(self, key: str) -> Type[T]:
    """Look up a registered class by key, or import a dotted path dynamically.

    Raises:
        KeyError: if the key is not registered.
        ConfigurationError: if the registered entry is a disabled/broken plugin.
    """
    if "." in key:
        # If the key contains a dot, we treat it as a import path and attempt
        # to load it dynamically.
        MyClass = import_key(key)
        self._check_cls(MyClass)
        return MyClass

    if key not in self._mapping:
        raise KeyError(f"Did not find a registered class for {key}")

    tp = self._mapping[key]
    if isinstance(tp, ModuleNotFoundError):
        # The plugin's optional dependency is not installed.
        raise ConfigurationError(
            f"{key} is disabled; try running: pip install '{__package_name__}[{key}]'"
        ) from tp
    if isinstance(tp, Exception):
        raise ConfigurationError(
            f"{key} is disabled due to an error in initialization"
        ) from tp
    # If it's not an exception, then it's a registered type.
    return tp
def from_looker_connection(
    cls, looker_connection: DBConnection
) -> "LookerConnectionDefinition":
    """Dialect definitions are here: https://docs.looker.com/setup-and-management/database-config"""
    if looker_connection.dialect_name is None:
        raise ConfigurationError(
            f"Unable to fetch a fully filled out connection for {looker_connection.name}. Please check your API permissions."
        )
    # Ordered pattern -> extractor table; the ".*" entry acts as a fallback.
    extractors: Dict[str, Any] = {
        "^bigquery": _get_bigquery_definition,
        ".*": _get_generic_definition,
    }
    for pattern, extractor in extractors.items():
        if not re.match(pattern, looker_connection.dialect_name):
            continue
        platform, db, schema = extractor(looker_connection)
        return cls(platform=platform, default_db=db, default_schema=schema)
    raise ConfigurationError(
        f"Could not find an appropriate platform for looker_connection: {looker_connection.name} with dialect: {looker_connection.dialect_name}"
    )
def load_config_file(config_file: Union[pathlib.Path, str]) -> dict:
    """Parse a YAML/TOML recipe file into a dict and resolve env-var references.

    Raises:
        ConfigurationError: if the file is missing or has an unsupported suffix.
    """
    path = pathlib.Path(config_file) if isinstance(config_file, str) else config_file
    if not path.is_file():
        raise ConfigurationError(f"Cannot open config file {path}")

    config_mech: ConfigurationMechanism
    if path.suffix in (".yaml", ".yml"):
        config_mech = YamlConfigurationMechanism()
    elif path.suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".format(
                path.suffix
            )
        )

    # Read the raw text first, then parse from an in-memory buffer.
    with path.open() as raw_fp:
        raw_text = raw_fp.read()
    config = config_mech.load_config(io.StringIO(raw_text))
    # NOTE(review): the return value of resolve_env_variables is discarded —
    # assumes it mutates `config` in place; verify against its definition.
    resolve_env_variables(config)
    return config
def test_connection(self) -> dict: response = self._session.get(f"{self._gms_server}/config") if response.status_code == 200: config: dict = response.json() if config.get("noCode") == "true": return config else: # Looks like we either connected to an old GMS or to some other service. Let's see if we can determine which before raising an error # A common misconfiguration is connecting to datahub-frontend so we special-case this check if (config.get("config", {}).get("application") == "datahub-frontend" or config.get( "config", {}).get("shouldShowDatasetLineage") is not None): message = "You seem to have connected to the frontend instead of the GMS endpoint. The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)" else: message = "You have either connected to a pre-v0.8.0 DataHub GMS instance, or to a different server altogether! Please check your configuration and make sure you are talking to the DataHub GMS endpoint." raise ConfigurationError(message) else: auth_message = "Maybe you need to set up authentication? " message = f"Unable to connect to {self._gms_server}/config with status_code: {response.status_code}. {auth_message if response.status_code == 401 else ''}Please check your configuration and make sure you are talking to the DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)." raise ConfigurationError(message)