Example #1
class AzureADConfig(ConfigModel):
    """Config to create a token and connect to Azure AD instance"""

    # Required
    client_id: str
    tenant_id: str
    client_secret: str
    redirect: str
    authority: str
    token_url: str
    graph_url: str

    # Optional: Customize the mapping to DataHub Username from an attribute in the REST API response
    # Reference: https://docs.microsoft.com/en-us/graph/api/user-list?view=graph-rest-1.0&tabs=http#response-1
    azure_ad_response_to_username_attr: str = "mail"
    azure_ad_response_to_username_regex: str = "([^@]+)"

    # Optional: Customize the mapping to DataHub Groupname from an attribute in the REST API response
    # Reference: https://docs.microsoft.com/en-us/graph/api/group-list?view=graph-rest-1.0&tabs=http#response-1
    azure_ad_response_to_groupname_attr: str = "displayName"
    azure_ad_response_to_groupname_regex: str = "(.*)"

    # Optional: to ingest users, groups or both
    ingest_users: bool = True
    ingest_groups: bool = True
    ingest_group_membership: bool = True

    ingest_groups_users: bool = True
    users_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    groups_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
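A minimal sketch of how the two optional mappings above would be applied: take the configured attribute from the Graph API user object, then keep the first regex group as the DataHub username. The sample payload and the extract_username helper are illustrative only, not part of the connector.

import re

# Illustrative Graph API user payload (shape follows the response docs linked above).
sample_user = {"displayName": "Jane Doe", "mail": "jane.doe@example.com"}


def extract_username(response: dict, attr: str = "mail", regex: str = "([^@]+)") -> str:
    # Pull the configured attribute, then keep the first regex group as the username.
    value = response[attr]
    match = re.search(regex, value)
    return match.group(1) if match else value


print(extract_username(sample_user))  # -> jane.doe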
Example #2
class SQLAlchemyConfig(ConfigModel):
    env: str = DEFAULT_ENV
    options: dict = {}
    # Although the 'table_pattern' lets you skip everything from certain schemas,
    # having a separate allow/deny option at the schema level is an optimization for the case where there is a large
    # number of schemas to skip: it avoids needlessly fetching those tables only to filter
    # them out afterwards via the table_pattern.
    schema_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    include_views: Optional[bool] = True
    include_tables: Optional[bool] = True

    @abstractmethod
    def get_sql_alchemy_url(self):
        pass

    def get_identifier(self, schema: str, table: str) -> str:
        return f"{schema}.{table}"

    def standardize_schema_table_names(self, schema: str,
                                       entity: str) -> Tuple[str, str]:
        # Some SQLAlchemy dialects need a standardization step to clean the schema
        # and table names. See BigQuery for an example of when this is useful.
        return schema, entity
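The comment above frames schema_pattern as a pre-filter that avoids fetching tables from unwanted schemas at all. A short sketch of how the two patterns compose, assuming AllowDenyPattern is importable from datahub.configuration.common:

from datahub.configuration.common import AllowDenyPattern  # import path assumed

schema_pattern = AllowDenyPattern(deny=["^information_schema$", "^tmp_.*"])
table_pattern = AllowDenyPattern(allow=["analytics\\..*"])

for schema in ["analytics", "tmp_scratch", "information_schema"]:
    if not schema_pattern.allowed(schema):
        continue  # skipped up front; no tables are fetched for this schema
    for table in [f"{schema}.orders", f"{schema}.events"]:
        if table_pattern.allowed(table):
            print("ingest", table)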
Example #3
class AwsSourceConfig(ConfigModel):
    """
    Common AWS credentials config.

    Currently used by:
        - Glue source
        - SageMaker source
    """

    env: str = DEFAULT_ENV

    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    aws_role: Optional[Union[str, List[str]]] = None
    aws_region: str

    def get_client(self, service: str) -> boto3.client:
        if (
            self.aws_access_key_id
            and self.aws_secret_access_key
            and self.aws_session_token
        ):
            return boto3.client(
                service,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                aws_session_token=self.aws_session_token,
                region_name=self.aws_region,
            )
        elif self.aws_access_key_id and self.aws_secret_access_key:
            return boto3.client(
                service,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                region_name=self.aws_region,
            )
        elif self.aws_role:
            if isinstance(self.aws_role, str):
                credentials = assume_role(self.aws_role, self.aws_region)
            else:
                credentials = reduce(
                    lambda new_credentials, role_arn: assume_role(
                        role_arn, self.aws_region, new_credentials
                    ),
                    self.aws_role,
                    {},
                )
            return boto3.client(
                service,
                aws_access_key_id=credentials["AccessKeyId"],
                aws_secret_access_key=credentials["SecretAccessKey"],
                aws_session_token=credentials["SessionToken"],
                region_name=self.aws_region,
            )
        else:
            return boto3.client(service, region_name=self.aws_region)
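The role branch above chains assumed roles with functools.reduce: each hop is assumed using the credentials returned by the previous hop, starting from the default credential chain. A sketch with an illustrative stand-in for assume_role (the real helper would call STS):

from functools import reduce


# Illustrative stand-in: the real assume_role would call sts.assume_role using the
# previous hop's credentials (if any) and return the newly issued credentials.
def assume_role(role_arn: str, region: str, credentials: dict = None) -> dict:
    issued_by = credentials.get("AccessKeyId") if credentials else "default credential chain"
    return {
        "AccessKeyId": f"AKIA-{role_arn.rsplit('/', 1)[-1]}",
        "SecretAccessKey": "redacted",
        "SessionToken": f"issued using {issued_by}",
    }


role_chain = [
    "arn:aws:iam::111111111111:role/ingest-base",
    "arn:aws:iam::222222222222:role/glue-read",
]
credentials = reduce(
    lambda new_credentials, role_arn: assume_role(role_arn, "us-east-1", new_credentials),
    role_chain,
    {},
)
print(credentials["SessionToken"])  # the final role was assumed with the first role's keys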
Example #4
class MongoDBConfig(ConfigModel):
    # See the MongoDB authentication docs for details and examples.
    # https://pymongo.readthedocs.io/en/stable/examples/authentication.html
    connect_uri: str = "mongodb://localhost"
    username: Optional[str] = None
    password: Optional[str] = None
    authMechanism: Optional[str] = None
    options: dict = {}
    enableSchemaInference: bool = True
    schemaSamplingSize: Optional[PositiveInt] = 1000
    useRandomSampling: bool = True
    maxSchemaSize: Optional[PositiveInt] = 300
    # MongoDB's maximum document size is 16MB. However, when a larger document is requested,
    # the error reports 16793600 bytes as the maximum supported size.
    maxDocumentSize: Optional[PositiveInt] = 16793600
    env: str = DEFAULT_ENV

    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    collection_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    @validator("maxDocumentSize")
    def check_max_doc_size_filter_is_valid(cls, doc_size_filter_value):
        if doc_size_filter_value > 16793600:
            raise ValueError(
                "maxDocumentSize must be a positive value <= 16793600.")
        return doc_size_filter_value
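The validator above caps maxDocumentSize at MongoDB's 16MB limit. A standalone pydantic sketch (v1-style validators, matching these examples) showing how an out-of-range value is rejected at config-parse time:

from pydantic import BaseModel, PositiveInt, ValidationError, validator


# Minimal standalone model reproducing the maxDocumentSize check above.
class DocSizeConfig(BaseModel):
    maxDocumentSize: PositiveInt = 16793600

    @validator("maxDocumentSize")
    def check_max_doc_size_filter_is_valid(cls, doc_size_filter_value):
        if doc_size_filter_value > 16793600:
            raise ValueError("maxDocumentSize must be a positive value <= 16793600.")
        return doc_size_filter_value


DocSizeConfig(maxDocumentSize=16793600)  # accepted
try:
    DocSizeConfig(maxDocumentSize=20_000_000)
except ValidationError as exc:
    print(exc)  # reports: maxDocumentSize must be a positive value <= 16793600.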
Example #5
class DBTConfig(ConfigModel):
    manifest_path: str
    catalog_path: str
    sources_path: Optional[str]
    env: str = DEFAULT_ENV
    target_platform: str
    load_schemas: bool = True
    use_identifiers: bool = False
    node_type_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    tag_prefix: str = f"{DBT_PLATFORM}:"
    node_name_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    disable_dbt_node_creation: bool = False
    meta_mapping: Dict = {}
    enable_meta_mapping: bool = True
    write_semantics: str = "PATCH"
    strip_user_ids_from_email: bool = False

    @validator("target_platform")
    def validate_target_platform_value(cls, target_platform: str) -> str:
        if target_platform.lower() == DBT_PLATFORM:
            raise ValueError(
                "target_platform cannot be dbt. It should be the platform which dbt is operating on top of. For e.g "
                "postgres.")
        return target_platform

    @validator("write_semantics")
    def validate_write_semantics(cls, write_semantics: str) -> str:
        if write_semantics.lower() not in {"patch", "override"}:
            raise ValueError(
                "write_semantics cannot be any other value than PATCH or OVERRIDE. Default value is PATCH. "
                "For PATCH semantics consider using the datahub-rest sink or "
                "provide a datahub_api: configuration on your ingestion recipe"
            )
        return write_semantics
Example #6
class SnowflakeUsageConfig(BaseSnowflakeConfig, BaseUsageConfig,
                           StatefulIngestionConfigBase):
    env: str = builder.DEFAULT_ENV
    options: dict = {}
    database_pattern: AllowDenyPattern = AllowDenyPattern(
        deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"])
    email_domain: Optional[str]
    schema_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    apply_view_usage_to_tables: bool = False
    stateful_ingestion: Optional[SnowflakeStatefulIngestionConfig] = None

    @pydantic.validator("role", always=True)
    def role_accountadmin(cls, v):
        if not v or v.lower() != "accountadmin":
            # This isn't an error, since the privileges can be delegated to other
            # roles as well: https://docs.snowflake.com/en/sql-reference/account-usage.html#enabling-account-usage-for-other-roles
            logger.info(
                'snowflake usage tables are only accessible by role "accountadmin" by default; you set %s',
                v,
            )
        return v

    def get_sql_alchemy_url(self):
        return super().get_sql_alchemy_url(database="snowflake")
Example #7
class GlueSourceConfig(ConfigModel):
    env: str = "PROD"
    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    aws_region: str

    @property
    def glue_client(self):
        if (self.aws_access_key_id and self.aws_secret_access_key
                and self.aws_session_token):
            return boto3.client(
                "glue",
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                aws_session_token=self.aws_session_token,
                region_name=self.aws_region,
            )
        elif self.aws_access_key_id and self.aws_secret_access_key:
            return boto3.client(
                "glue",
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                region_name=self.aws_region,
            )
        else:
            return boto3.client("glue", region_name=self.aws_region)
Example #8
class RedashConfig(ConfigModel):
    # See the Redash API for details
    # https://redash.io/help/user-guide/integrations-and-api/api
    connect_uri: str = Field(default="http://localhost:5000",
                             description="Redash base URL.")
    api_key: str = Field(default="REDASH_API_KEY",
                         description="Redash user API key.")

    # Optionals
    dashboard_patterns: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for dashboards to filter for ingestion.",
    )
    chart_patterns: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for charts to filter for ingestion.",
    )
    skip_draft: bool = Field(
        default=True,
        description="Only ingest published dashboards and charts.")
    api_page_limit: int = Field(
        default=sys.maxsize,
        description="Limit on ingested dashboards and charts API pagination.",
    )
    parse_table_names_from_sql: bool = Field(default=False,
                                             description="See note below.")
    sql_parser: str = Field(
        default="datahub.utilities.sql_parser.DefaultSQLParser",
        description="custom SQL parser. See note below for details.",
    )

    env: str = Field(
        default=DEFAULT_ENV,
        description="Environment to use in namespace when constructing URNs.",
    )
Example #9
class SQLAlchemyConfig(StatefulIngestionConfigBase):
    env: str = DEFAULT_ENV
    options: dict = {}
    # Although the 'table_pattern' lets you skip everything from certain schemas,
    # having a separate allow/deny option at the schema level is an optimization for the case where there is a large
    # number of schemas to skip: it avoids needlessly fetching those tables only to filter
    # them out afterwards via the table_pattern.
    schema_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    profile_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    include_views: Optional[bool] = True
    include_tables: Optional[bool] = True

    from datahub.ingestion.source.ge_data_profiler import GEProfilingConfig

    profiling: GEProfilingConfig = GEProfilingConfig()
    # Custom Stateful Ingestion settings
    stateful_ingestion: Optional[SQLAlchemyStatefulIngestionConfig] = None

    @pydantic.root_validator()
    def ensure_profiling_pattern_is_passed_to_profiling(
            cls, values: Dict[str, Any]) -> Dict[str, Any]:
        profiling = values.get("profiling")
        if profiling is not None and profiling.enabled:
            profiling.allow_deny_patterns = values["profile_pattern"]
        return values

    @abstractmethod
    def get_sql_alchemy_url(self):
        pass
Example #10
class LookMLSourceConfig(ConfigModel):
    base_folder: pydantic.DirectoryPath
    connection_to_platform_map: Dict[str, str]
    platform_name: str = "looker"
    model_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    env: str = builder.DEFAULT_ENV
    parse_table_names_from_sql: bool = False
Example #11
class GlueSourceConfig(ConfigModel):
    env: str = "PROD"

    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    extract_transforms: Optional[bool] = True

    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    aws_role: Optional[Union[str, List[str]]] = None
    aws_region: str

    def get_client(self, service: str) -> boto3.client:
        if (self.aws_access_key_id and self.aws_secret_access_key
                and self.aws_session_token):
            return boto3.client(
                service,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                aws_session_token=self.aws_session_token,
                region_name=self.aws_region,
            )
        elif self.aws_access_key_id and self.aws_secret_access_key:
            return boto3.client(
                service,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                region_name=self.aws_region,
            )
        elif self.aws_role:
            if isinstance(self.aws_role, str):
                credentials = assume_role(self.aws_role, self.aws_region)
            else:
                credentials = reduce(
                    lambda new_credentials, role_arn: assume_role(
                        role_arn, self.aws_region, new_credentials),
                    self.aws_role,
                    {},
                )
            return boto3.client(
                service,
                aws_access_key_id=credentials["AccessKeyId"],
                aws_secret_access_key=credentials["SecretAccessKey"],
                aws_session_token=credentials["SessionToken"],
                region_name=self.aws_region,
            )
        else:
            return boto3.client(service, region_name=self.aws_region)

    @property
    def glue_client(self):
        return self.get_client("glue")

    @property
    def s3_client(self):
        return self.get_client("s3")
Example #12
class LookMLSourceConfig(ConfigModel):  # pragma: no cover
    base_folder: str
    connection_to_platform_map: Dict[str, str]
    platform_name: str = "looker_views"
    actor: str = "urn:li:corpuser:etl"
    model_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    env: str = "PROD"
    parse_table_names_from_sql: bool = False
Example #13
class LookerDashboardSourceConfig(ConfigModel):
    client_id: str
    client_secret: str
    base_url: str
    platform_name: str = "looker"
    actor: str = "urn:li:corpuser:etl"
    dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    env: str = "PROD"
Example #14
class LookMLSourceConfig(LookerCommonConfig):
    base_folder: pydantic.DirectoryPath
    connection_to_platform_map: Optional[Dict[str, LookerConnectionDefinition]]
    model_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    parse_table_names_from_sql: bool = False
    sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser"
    api: Optional[LookerAPIConfig]
    project_name: Optional[str]
    transport_options: Optional[TransportOptions]

    @validator("platform_instance")
    def platform_instance_not_supported(cls, v: str) -> str:
        raise ConfigurationError(
            "LookML Source doesn't support platform instance at the top level. However connection-specific platform instances are supported for generating lineage edges. Read the documentation to find out more."
        )

    @validator("connection_to_platform_map", pre=True)
    def convert_string_to_connection_def(cls, conn_map):
        # Previous versions of the config supported strings in the connection map. This up-converts those strings to LookerConnectionDefinition objects.
        for key in conn_map:
            if isinstance(conn_map[key], str):
                platform = conn_map[key]
                if "." in platform:
                    platform_db_split = conn_map[key].split(".")
                    connection = LookerConnectionDefinition(
                        platform=platform_db_split[0],
                        default_db=platform_db_split[1],
                        default_schema="",
                    )
                    conn_map[key] = connection
                else:
                    logger.warning(
                        f"Connection map for {key} provides platform {platform} but does not provide a default database name. This might result in failed resolution"
                    )
                    conn_map[key] = LookerConnectionDefinition(
                        platform=platform, default_db="", default_schema="")
        return conn_map

    @root_validator()
    def check_either_connection_map_or_connection_provided(cls, values):
        """Validate that we must either have a connection map or an api credential"""
        if not values.get("connection_to_platform_map", {}) and not values.get(
                "api", {}):
            raise ConfigurationError(
                "Neither api not connection_to_platform_map config was found. LookML source requires either api credentials for Looker or a map of connection names to platform identifiers to work correctly"
            )
        return values

    @root_validator()
    def check_either_project_name_or_api_provided(cls, values):
        """Validate that we must either have a project name or an api credential to fetch project names"""
        if not values.get("project_name") and not values.get("api"):
            raise ConfigurationError(
                "Neither project_name not an API credential was found. LookML source requires either api credentials for Looker or a project_name to accurately name views and models."
            )
        return values
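convert_string_to_connection_def above up-converts legacy string entries like "postgres.mydb" into structured connection definitions. An illustrative version of just that split, with the connection definition approximated by a plain dict since LookerConnectionDefinition's full field set is not shown here:

# Illustrative up-conversion: "platform.default_db" strings become structured
# connection definitions (represented here as plain dicts).
def upconvert(conn_map: dict) -> dict:
    for key, value in conn_map.items():
        if isinstance(value, str):
            if "." in value:
                platform, default_db = value.split(".", 1)
            else:
                # No default database: lineage resolution may fail later, as warned above.
                platform, default_db = value, ""
            conn_map[key] = {
                "platform": platform,
                "default_db": default_db,
                "default_schema": "",
            }
    return conn_map


print(upconvert({"warehouse": "snowflake.analytics", "legacy": "postgres"}))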
Example #15
def test_trino_ingest(loaded_trino, test_resources_dir, pytestconfig, tmp_path,
                      mock_time):

    # Run the metadata ingestion pipeline.
    with fs_helpers.isolated_filesystem(tmp_path):

        # Run the metadata ingestion pipeline for trino catalog referring to postgres database
        mce_out_file = "trino_mces.json"
        events_file = tmp_path / mce_out_file

        pipeline_config = {
            "run_id": "trino-test",
            "source": {
                "type":
                data_platform,
                "config":
                TrinoConfig(
                    host_port="localhost:5300",
                    database="postgresqldb",
                    database_alias="library_catalog",
                    username="******",
                    schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
                    profile_pattern=AllowDenyPattern(
                        allow=["library_catalog.librarydb.*"]),
                    profiling=GEProfilingConfig(
                        enabled=True,
                        include_field_null_count=True,
                        include_field_min_value=True,
                        include_field_max_value=True,
                        include_field_mean_value=True,
                        include_field_median_value=True,
                        include_field_stddev_value=True,
                        include_field_quantiles=True,
                        include_field_distinct_value_frequencies=True,
                        include_field_histogram=True,
                        include_field_sample_values=True,
                    ),
                ).dict(),
            },
            "sink": {
                "type": "file",
                "config": FileSinkConfig(filename=str(events_file)).dict(),
            },
        }

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)
        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path="trino_mces.json",
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
Example #16
class LookerDashboardSourceConfig(LookerAPIConfig, LookerCommonConfig):
    platform_name: str = "looker"
    actor: Optional[str]
    dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    include_deleted: bool = False
    env: str = builder.DEFAULT_ENV
    extract_owners: bool = True
    strip_user_ids_from_email: bool = False
    skip_personal_folders: bool = False
Example #17
def test_fully_speced():
    pattern = AllowDenyPattern(allow=["foo.mytable"])
    assert pattern.is_fully_specified_allow_list()
    pattern = AllowDenyPattern(allow=["foo.*", "foo.table"])
    assert not pattern.is_fully_specified_allow_list()
    pattern = AllowDenyPattern(allow=["foo.?", "foo.table"])
    assert not pattern.is_fully_specified_allow_list()
Example #18
class MongoDBConfig(ConfigModel):
    # See the MongoDB authentication docs for details and examples.
    # https://pymongo.readthedocs.io/en/stable/examples/authentication.html
    connect_uri: str = "mongodb://localhost"
    username: Optional[str] = None
    password: Optional[str] = None
    authMechanism: Optional[str] = None
    options: dict = {}

    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    collection_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
Example #19
class LookerDashboardSourceConfig(ConfigModel):
    client_id: str
    client_secret: str
    base_url: str
    platform_name: str = "looker"
    # The datahub platform where looker views are stored, must be the same as `platform_name` in lookml source
    view_platform_name: str = "looker_views"
    actor: str = "urn:li:corpuser:etl"
    dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    env: str = "PROD"
Example #20
class RedashConfig(ConfigModel):
    # See the Redash API for details
    # https://redash.io/help/user-guide/integrations-and-api/api
    connect_uri: str = "http://localhost:5000"
    api_key: str = "REDASH_API_KEY"
    env: str = DEFAULT_ENV

    # Optionals
    dashboard_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
    skip_draft: bool = True
    api_page_limit: int = sys.maxsize
Example #21
class SnowflakeUsageConfig(BaseSnowflakeConfig, BaseUsageConfig,
                           StatefulIngestionConfigBase):
    options: dict = pydantic.Field(
        default_factory=dict,
        description=
        "Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details.",
    )

    database_pattern: AllowDenyPattern = pydantic.Field(
        default=AllowDenyPattern(
            deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"]),
        description=
        "List of regex patterns for databases to include/exclude in usage ingestion.",
    )
    email_domain: Optional[str] = pydantic.Field(
        description=
        "Email domain of your organisation so users can be displayed on UI appropriately."
    )
    schema_pattern: AllowDenyPattern = pydantic.Field(
        default=AllowDenyPattern.allow_all(),
        description=
        "List of regex patterns for schemas to include/exclude in usage ingestion.",
    )
    table_pattern: AllowDenyPattern = pydantic.Field(
        default=AllowDenyPattern.allow_all(),
        description=
        "List of regex patterns for tables to include in ingestion.",
    )
    view_pattern: AllowDenyPattern = pydantic.Field(
        default=AllowDenyPattern.allow_all(),
        description="List of regex patterns for views to include in ingestion.",
    )
    apply_view_usage_to_tables: bool = pydantic.Field(
        default=False,
        description="Allow/deny patterns for views in snowflake dataset names.",
    )
    stateful_ingestion: Optional[
        SnowflakeStatefulIngestionConfig] = pydantic.Field(
            default=None, description="Stateful ingestion related configs")

    def get_options(self) -> dict:
        options_connect_args: Dict = super().get_sql_alchemy_connect_args()
        options_connect_args.update(self.options.get("connect_args", {}))
        self.options["connect_args"] = options_connect_args
        return self.options

    def get_sql_alchemy_url(self):
        return super().get_sql_alchemy_url(
            database="snowflake",
            username=self.username,
            password=self.password,
            role=self.role,
        )
Example #22
class LookerDashboardSourceConfig(LookerAPIConfig, LookerCommonConfig):
    dashboard_pattern: AllowDenyPattern = Field(
        AllowDenyPattern.allow_all(),
        description=
        "Patterns for selecting dashboard ids that are to be included",
    )
    chart_pattern: AllowDenyPattern = Field(
        AllowDenyPattern.allow_all(),
        description="Patterns for selecting chart ids that are to be included",
    )
    include_deleted: bool = Field(
        False, description="Whether to include deleted dashboards.")
    extract_owners: bool = Field(
        True,
        description=
        "When enabled, extracts ownership from Looker directly. When disabled, ownership is left empty for dashboards and charts.",
    )
    actor: Optional[str] = Field(
        None,
        description=
        "This config is deprecated in favor of `extract_owners`. Previously, was the actor to use in ownership properties of ingested metadata.",
    )
    strip_user_ids_from_email: bool = Field(
        False,
        description=
        "When enabled, converts Looker user emails of the form [email protected] to urn:li:corpuser:name when assigning ownership",
    )
    skip_personal_folders: bool = Field(
        False,
        description=
        "Whether to skip ingestion of dashboards in personal folders. Setting this to True will only ingest dashboards in the Shared folder space.",
    )
    max_threads: int = Field(
        os.cpu_count() or 40,
        description=
        "Max parallelism for Looker API calls. Defaults to cpuCount or 40",
    )
    external_base_url: Optional[str] = Field(
        None,
        description=
        "Optional URL to use when constructing external URLs to Looker if the `base_url` is not the correct one to use. For example, `https://looker-public.company.com`. If not provided, the external base URL will default to `base_url`.",
    )

    @validator("external_base_url", pre=True, always=True)
    def external_url_defaults_to_api_config_base_url(
            cls, v: Optional[str], *, values: Dict[str, Any],
            **kwargs: Dict[str, Any]) -> str:
        return v or values["base_url"]

    @validator("platform_instance")
    def platform_instance_not_supported(cls, v: str) -> str:
        raise ConfigurationError(
            "Looker Source doesn't support platform instances")
Example #23
class MongoDBConfig(ConfigModel):
    # See the MongoDB authentication docs for details and examples.
    # https://pymongo.readthedocs.io/en/stable/examples/authentication.html
    connect_uri: str = "mongodb://localhost"
    username: Optional[str] = None
    password: Optional[str] = None
    authMechanism: Optional[str] = None
    options: dict = {}
    enableSchemaInference: bool = True
    schemaSamplingSize: Optional[PositiveInt] = 1000
    env: str = DEFAULT_ENV

    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    collection_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
Example #24
class RedashConfig(ConfigModel):
    # See the Redash API for details
    # https://redash.io/help/user-guide/integrations-and-api/api
    connect_uri: str = "http://localhost:5000"
    api_key: str = "REDASH_API_KEY"
    env: str = DEFAULT_ENV

    # Optionals
    dashboard_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
    skip_draft: bool = True
    api_page_limit: int = sys.maxsize
    parse_table_names_from_sql: bool = False
    sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser"
Example #25
class MongoDBConfig(EnvBasedSourceConfigBase):
    # See the MongoDB authentication docs for details and examples.
    # https://pymongo.readthedocs.io/en/stable/examples/authentication.html
    connect_uri: str = Field(default="mongodb://localhost",
                             description="MongoDB connection URI.")
    username: Optional[str] = Field(default=None,
                                    description="MongoDB username.")
    password: Optional[str] = Field(default=None,
                                    description="MongoDB password.")
    authMechanism: Optional[str] = Field(
        default=None, description="MongoDB authentication mechanism.")
    options: dict = Field(
        default={},
        description="Additional options to pass to `pymongo.MongoClient()`.")
    enableSchemaInference: bool = Field(
        default=True, description="Whether to infer schemas. ")
    schemaSamplingSize: Optional[PositiveInt] = Field(
        default=1000,
        description=
        "Number of documents to use when inferring schema size. If set to `0`, all documents will be scanned.",
    )
    useRandomSampling: bool = Field(
        default=True,
        description=
        "If documents for schema inference should be randomly selected. If `False`, documents will be selected from start.",
    )
    maxSchemaSize: Optional[PositiveInt] = Field(
        default=300,
        description="Maximum number of fields to include in the schema.")
    # MongoDB's maximum document size is 16MB. However, when a larger document is requested,
    # the error reports 16793600 bytes as the maximum supported size.
    maxDocumentSize: Optional[PositiveInt] = Field(default=16793600,
                                                   description="")

    database_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for databases to filter in ingestion.",
    )
    collection_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for collections to filter in ingestion.",
    )

    @validator("maxDocumentSize")
    def check_max_doc_size_filter_is_valid(cls, doc_size_filter_value):
        if doc_size_filter_value > 16793600:
            raise ValueError(
                "maxDocumentSize must be a positive value <= 16793600.")
        return doc_size_filter_value
Example #26
def test_case_sensitivity():
    pattern = AllowDenyPattern(allow=["Foo.myTable"])
    assert pattern.allowed("foo.mytable")
    assert pattern.allowed("FOO.MYTABLE")
    assert pattern.allowed("Foo.MyTable")
    pattern = AllowDenyPattern(allow=["Foo.myTable"], ignoreCase=False)
    assert not pattern.allowed("foo.mytable")
    assert pattern.allowed("Foo.myTable")
Example #27
class NifiSourceConfig(ConfigModel):
    site_url: str

    auth: NifiAuthType = NifiAuthType.NO_AUTH

    provenance_days: int = 7  # Fetch provenance events for past 1 week
    process_group_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    # Required for nifi deployments using Remote Process Groups
    site_name: str = "default"
    site_url_to_site_name: Dict[str, str] = {}

    # Required to be set if auth is of type SINGLE_USER
    username: Optional[str]
    password: Optional[str]

    # Required to be set if auth is of type CLIENT_CERT
    client_cert_file: Optional[str]
    client_key_file: Optional[str]
    client_key_password: Optional[str]

    # Required to be set if the nifi server certificate is not signed by a
    # root CA trusted by the client system, e.g. self-signed certificates
    ca_file: Optional[str]

    env: str = builder.DEFAULT_ENV
Example #28
class SnowflakeConfig(BaseSnowflakeConfig, SQLAlchemyConfig):
    database_pattern: AllowDenyPattern = AllowDenyPattern(
        deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"]
    )

    provision_role: Optional[SnowflakeProvisionRoleConfig] = None
    ignore_start_time_lineage: bool = False
    upstream_lineage_in_report: bool = False

    def get_sql_alchemy_url(
        self,
        database: Optional[str] = None,
        username: Optional[str] = None,
        password: Optional[pydantic.SecretStr] = None,
        role: Optional[str] = None,
    ) -> str:
        return super().get_sql_alchemy_url(
            database=database, username=username, password=password, role=role
        )

    def get_options(self) -> dict:
        options_connect_args: Dict = super().get_sql_alchemy_connect_args()
        options_connect_args.update(self.options.get("connect_args", {}))
        self.options["connect_args"] = options_connect_args
        return self.options
Example #29
            def is_dataset_pattern_allowed(
                    dataset_name: Optional[Any],
                    dataset_type: Optional[Any]) -> bool:
                # TODO: support table/view patterns for usage logs by pulling that information as well from the usage query
                if not dataset_type or not dataset_name:
                    return True

                table_or_view_pattern: Optional[
                    AllowDenyPattern] = AllowDenyPattern.allow_all()
                # Test domain type = external_table and then add it
                table_or_view_pattern = (
                    self.config.table_pattern
                    if dataset_type.lower() in {"table"} else
                    (self.config.view_pattern if dataset_type.lower()
                     in {"view", "materialized_view"} else None))
                if table_or_view_pattern is None:
                    return True

                dataset_params = dataset_name.split(".")
                assert len(dataset_params) == 3
                if (not self.config.database_pattern.allowed(dataset_params[0])
                        or not self.config.schema_pattern.allowed(
                            dataset_params[1]) or
                        not table_or_view_pattern.allowed(dataset_params[2])):
                    return False
                return True
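The same three-level check can be sketched standalone with AllowDenyPattern (import path assumed), splitting a fully qualified name into database, schema, and table and requiring all three patterns to pass:

from datahub.configuration.common import AllowDenyPattern  # import path assumed

database_pattern = AllowDenyPattern(deny=["^SNOWFLAKE$"])
schema_pattern = AllowDenyPattern.allow_all()
table_pattern = AllowDenyPattern(allow=["ORDERS", "CUSTOMERS"])
view_pattern = AllowDenyPattern.allow_all()


def is_allowed(dataset_name: str, dataset_type: str = "table") -> bool:
    # dataset_name is "database.schema.table", mirroring the split above.
    database, schema, table = dataset_name.split(".")
    pattern = table_pattern if dataset_type == "table" else view_pattern
    return (
        database_pattern.allowed(database)
        and schema_pattern.allowed(schema)
        and pattern.allowed(table)
    )


print(is_allowed("ANALYTICS.PUBLIC.ORDERS"))                # True
print(is_allowed("SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY"))  # False: database is denied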
Example #30
class ElasticsearchSourceConfig(ConfigModel):
    host: str = "localhost:9092"
    username: str = ""
    password: str = ""
    env: str = DEFAULT_ENV
    index_pattern: AllowDenyPattern = AllowDenyPattern(
        allow=[".*"], deny=["^_.*", "^ilm-history.*"])

    @validator("host")
    def host_colon_port_comma(cls, host_val: str) -> str:
        for entry in host_val.split(","):
            # The port can be provided but is not required.
            port = None
            if ":" in entry:
                (host, port) = entry.rsplit(":", 1)
            else:
                host = entry
            assert re.match(
                # This regex is quite loose. Many invalid hostnames or IPs will slip through,
                # but it serves as a good first line of validation. We defer to Elasticsearch for the
                # remaining validation.
                r"^[\w\-\.\:]+$",
                host,
            ), f"host contains bad characters, found {host}"
            if port is not None:
                assert port.isdigit(), f"port must be all digits, found {port}"
        return host_val
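For completeness, the host validation above can be exercised on its own: split the comma-separated host list, peel off an optional port, and sanity-check each piece with the same loose regex. This is just a standalone restatement of the validator's logic:

import re


def check_hosts(host_val: str) -> str:
    for entry in host_val.split(","):
        host, port = entry.rsplit(":", 1) if ":" in entry else (entry, None)
        assert re.match(r"^[\w\-\.\:]+$", host), f"host contains bad characters, found {host}"
        if port is not None:
            assert port.isdigit(), f"port must be all digits, found {port}"
    return host_val


print(check_hosts("es-node-1:9200,es-node-2:9200"))  # passes validation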