def scan(
    source_name: Optional[List[str]] = typer.Option(
        None, help="List of names of database and data warehouses"
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
):
    """Scan the selected sources and store their metadata in the catalog.

    Opens the catalog from the app-level connection settings, initializes the
    schema if needed, and pulls metadata for the requested sources, honoring
    the include/exclude schema and table regex filters.
    """
    catalog = open_catalog(**app_state["catalog_connection"])
    # closing() guarantees the catalog connection is released even on error.
    with closing(catalog):
        init_db(catalog)
        try:
            scan_sources(
                catalog=catalog,
                source_names=source_name,
                include_schema_regex=include_schema,
                exclude_schema_regex=exclude_schema,
                include_table_regex=include_table,
                exclude_table_regex=exclude_table,
            )
        except NoMatchesError:
            # Nothing matched the filters; report instead of raising to the CLI.
            typer.echo(
                "No schema or tables scanned. Ensure include/exclude patterns are correct "
                "and database has tables"
            )
def test_pull_exclude_table(setup_catalog_and_data):
    """Tables matching any exclude pattern must be skipped during a scan."""
    cat = setup_catalog_and_data
    scan_sources(cat, ["pg"], exclude_table_regex=["full.*", "partial.*"])
    with cat.managed_session:
        src = cat.get_source("pg")
        assert src is not None
        schemas = src.schemata
        assert len(schemas) == 1
        # Only the table not matching "full.*" / "partial.*" should remain.
        remaining = schemas[0].tables
        assert len(remaining) == 1
        assert remaining[0].name == "no_pii"
def test_pull_include_table_list(setup_catalog_and_data):
    """Multiple include patterns should each contribute matching tables."""
    cat = setup_catalog_and_data
    scan_sources(cat, ["sqlite_db"], include_table_regex=["full.*", "partial.*"])
    with cat.managed_session:
        src = cat.get_source("sqlite_db")
        assert src is not None
        schemas = src.schemata
        assert len(schemas) == 1
        # Exactly the two tables matching the include patterns are ingested.
        matched = schemas[0].tables
        assert len(matched) == 2
        assert matched[0].name == "full_pii"
        assert matched[1].name == "partial_pii"
def setup_incremental(
    load_sample_data, load_data
) -> Generator[Tuple[Catalog, int], None, None]:
    """Fixture: run an alternating sequence of full and filtered scans.

    Executes four scans, each separated by a one-second pause, so that tests
    can exercise incremental-scan behavior that depends on distinct last-run
    timestamps. Yields the catalog and the scanned source's id.
    """
    catalog, source_id, name = load_sample_data
    with catalog.managed_session:
        # First pull: metadata restricted to the "sample" table.
        scan_sources(catalog, [name], include_table_regex=["sample"])
    # Pause so the next scan's task timestamp is strictly later.
    time.sleep(1)
    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        # Second pass over the same table via scan_database (incremental path).
        scan_database(catalog=catalog, source=source, include_table_regex=["sample"])
    time.sleep(1)
    with catalog.managed_session:
        # Unfiltered pull of the whole source.
        scan_sources(catalog, [name])
    time.sleep(1)
    with catalog.managed_session:
        # Final filtered scan over a different table pattern.
        scan_database(catalog=catalog, source=source, include_table_regex=["partial.*"])
    yield catalog, source_id
def scan(
    source_name: Optional[List[str]] = typer.Option(
        None, help="List of names of database and data warehouses"
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
):
    """Scan the selected sources and store their metadata in the catalog.

    Builds the catalog connection from ``dbcat.settings``, initializes the
    catalog schema, and pulls metadata for the requested sources, applying
    the include/exclude schema and table regex filters.
    """
    # Collect the connection settings in one place before opening the catalog.
    connection_args = dict(
        app_dir=dbcat.settings.APP_DIR,
        secret=dbcat.settings.CATALOG_SECRET,
        path=dbcat.settings.CATALOG_PATH,
        host=dbcat.settings.CATALOG_HOST,
        port=dbcat.settings.CATALOG_PORT,
        user=dbcat.settings.CATALOG_USER,
        password=dbcat.settings.CATALOG_PASSWORD,
        database=dbcat.settings.CATALOG_DB,
    )
    catalog = open_catalog(**connection_args)
    # closing() guarantees the catalog connection is released even on error.
    with closing(catalog):
        init_db(catalog)
        try:
            scan_sources(
                catalog=catalog,
                source_names=source_name,
                include_schema_regex=include_schema,
                exclude_schema_regex=exclude_schema,
                include_table_regex=include_table,
                exclude_table_regex=exclude_table,
            )
        except NoMatchesError:
            # Nothing matched the filters; report instead of raising to the CLI.
            typer.echo(
                "No schema or tables scanned. Ensure include/exclude patterns are correct "
                "and database has tables"
            )
def scan_database(
    catalog: Catalog,
    source: CatSource,
    scan_type: ScanTypeEnum = ScanTypeEnum.metadata,
    incremental: bool = True,
    output_format: OutputFormat = OutputFormat.tabular,
    list_all: bool = False,
    include_schema_regex: Optional[List[str]] = None,
    exclude_schema_regex: Optional[List[str]] = None,
    include_table_regex: Optional[List[str]] = None,
    exclude_table_regex: Optional[List[str]] = None,
    sample_size: int = SMALL_TABLE_MAX,
) -> Union[List[Any], Dict[Any, Any]]:
    """Scan one source for PII and return the findings.

    Pulls the source's metadata into the catalog, then runs either metadata
    detectors (``scan_type == ScanTypeEnum.metadata``) or data detectors
    (sampling up to ``sample_size`` rows per table). When ``incremental`` is
    true, only columns updated since the source's last recorded scan task are
    examined. The outcome (exit code and message) is always recorded as a
    catalog task, even on failure.

    :param catalog: open catalog to read/write scan state.
    :param source: the source to scan.
    :param scan_type: metadata-only or data scan.
    :param incremental: restrict the scan to changes since the last run.
    :param output_format: tabular list or nested dict output.
    :param list_all: include non-PII columns in the output.
    :param include_schema_regex: schema name patterns to include.
    :param exclude_schema_regex: schema name patterns to exclude.
    :param include_table_regex: table name patterns to include.
    :param exclude_table_regex: table name patterns to exclude.
    :param sample_size: max rows sampled per table for data scans.
    :return: scan results in the requested output format.
    :raises Exception: re-raises any scan failure after recording the task.
    """

    def _fmt_regex(regex_list: Optional[List[str]]) -> str:
        # Human-readable form of a regex list for the recorded task message.
        return ",".join(regex_list) if regex_list is not None else "None"

    # BUG FIX: the last label previously read "exclude_schema" twice; the
    # final placeholder is the exclude_table filter.
    message = (
        "Source: {source_name}, scan_type: {scan_type}, "
        "include_schema: {include_schema}, exclude_schema: {exclude_schema}, "
        "include_table: {include_table}, exclude_table: {exclude_table}".format(
            source_name=source.name,
            scan_type=str(scan_type),
            include_schema=_fmt_regex(include_schema_regex),
            exclude_schema=_fmt_regex(exclude_schema_regex),
            include_table=_fmt_regex(include_table_regex),
            exclude_table=_fmt_regex(exclude_table_regex),
        )
    )
    status_message = "Success"
    exit_code = 0

    with catalog.managed_session:
        last_run: Optional[datetime.datetime] = None
        if incremental:
            last_task = catalog.get_latest_task("piicatcher.{}".format(source.name))
            last_run = last_task.updated_at if last_task is not None else None
            if last_run is not None:
                # BUG FIX: stdlib logging uses %-style placeholders; "{}" was
                # never interpolated.
                LOGGER.debug("Last Run at %s", last_run)
            else:
                LOGGER.debug("No last run found")

        try:
            # Pull metadata once. (A second identical scan_sources call that
            # preceded the last-run lookup was redundant and has been removed.)
            scan_sources(
                catalog=catalog,
                source_names=[source.name],
                include_schema_regex=include_schema_regex,
                exclude_schema_regex=exclude_schema_regex,
                include_table_regex=include_table_regex,
                exclude_table_regex=exclude_table_regex,
            )
            if scan_type == ScanTypeEnum.metadata:
                detector_list = [
                    detector()
                    for detector in detectors.detector_registry.get_all().values()
                    if issubclass(detector, MetadataDetector)
                ]
                metadata_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                    ),
                    generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                    ),
                )
            else:
                detector_list = [
                    detector()
                    for detector in detectors.detector_registry.get_all().values()
                    if issubclass(detector, DatumDetector)
                ]
                data_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                    ),
                    generator=data_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                        sample_size=sample_size,
                    ),
                    sample_size=sample_size,
                )

            if output_format == OutputFormat.tabular:
                return output_tabular(
                    catalog=catalog, source=source, list_all=list_all, last_run=last_run
                )
            else:
                return output_dict(
                    catalog=catalog, source=source, list_all=list_all, last_run=last_run
                )
        except Exception as e:
            status_message = str(e)
            exit_code = 1
            raise e
        finally:
            # Record the outcome regardless of success or failure.
            catalog.add_task(
                "piicatcher.{}".format(source.name),
                exit_code,
                "{}.{}".format(message, status_message),
            )
def test_pull(setup_catalog_and_data, source):
    """A metadata pull of a single source should ingest its full schema."""
    cat = setup_catalog_and_data
    scan_sources(cat, [source])
    # Shared assertion helper verifies the ingested schemata and tables.
    run_asserts(cat, source)
def test_pull_all(setup_catalog_and_data):
    """Scanning several sources in one call should ingest all of them."""
    cat = setup_catalog_and_data
    names = ["pg", "mysql", "sqlite_db"]
    scan_sources(cat, names)
    # Every source must pass the shared ingestion assertions.
    for name in names:
        run_asserts(cat, name)
def load_data_and_pull(
    load_data,
) -> Generator[Tuple[Catalog, str, int], None, None]:
    """Fixture: scan the freshly loaded source once, then yield it to the test."""
    catalog, conf, source_id, name = load_data
    # Single metadata pull so tests start from a populated catalog.
    scan_sources(catalog, [name])
    yield catalog, conf, source_id