Example #1
0
File: cli.py Project: vrajat/dbcat
def scan(
    source_name: Optional[List[str]] = typer.Option(
        None, help="List of names of database and data warehouses"
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
):
    # Scan the named sources into the catalog, applying the schema/table
    # include and exclude patterns supplied as options.
    #
    # NOTE(review): documented with comments on purpose. If this function is
    # registered as a Typer command, a docstring would presumably be surfaced
    # as the command's help text - confirm before converting to a docstring.
    no_match_notice = (
        "No schema or tables scanned. Ensure include/exclude patterns are correct "
        "and database has tables"
    )
    scan_filters = {
        "include_schema_regex": include_schema,
        "exclude_schema_regex": exclude_schema,
        "include_table_regex": include_table,
        "exclude_table_regex": exclude_table,
    }

    # closing() hands back the catalog it wraps and calls catalog.close()
    # when the block exits, whether or not the scan raised.
    with closing(open_catalog(**app_state["catalog_connection"])) as catalog:
        init_db(catalog)
        try:
            scan_sources(catalog=catalog, source_names=source_name, **scan_filters)
        except NoMatchesError:
            # Report the empty result to the user instead of propagating.
            typer.echo(no_match_notice)
Example #2
0
def test_pull_exclude_table(setup_catalog_and_data):
    """Scanning "pg" while excluding full.*/partial.* tables leaves only "no_pii"."""
    catalog = setup_catalog_and_data
    scan_sources(
        catalog,
        ["pg"],
        exclude_table_regex=["full.*", "partial.*"],
    )

    with catalog.managed_session:
        pg_source = catalog.get_source("pg")
        assert pg_source is not None

        # Exactly one schema is expected to have been catalogued.
        found_schemata = pg_source.schemata
        assert len(found_schemata) == 1

        # Only the table that matches neither exclude pattern should remain.
        found_tables = found_schemata[0].tables
        assert len(found_tables) == 1

        assert found_tables[0].name == "no_pii"
Example #3
0
def test_pull_include_table_list(setup_catalog_and_data):
    """Scanning "sqlite_db" with two include patterns yields the two matching tables."""
    catalog = setup_catalog_and_data
    scan_sources(
        catalog,
        ["sqlite_db"],
        include_table_regex=["full.*", "partial.*"],
    )

    with catalog.managed_session:
        sqlite_source = catalog.get_source("sqlite_db")
        assert sqlite_source is not None

        # Exactly one schema is expected to have been catalogued.
        found_schemata = sqlite_source.schemata
        assert len(found_schemata) == 1

        # One table per include pattern, in the order asserted below.
        found_tables = found_schemata[0].tables
        assert len(found_tables) == 2

        assert found_tables[0].name == "full_pii"
        assert found_tables[1].name == "partial_pii"
Example #4
0
def setup_incremental(
    load_sample_data, load_data
) -> Generator[Tuple[Catalog, int], None, None]:
    """Run a fixed sequence of catalog and database scans, then yield the catalog.

    The sequence is: a source scan limited to "sample", a database scan limited
    to "sample", an unrestricted source scan, and finally a database scan
    limited to "partial.*". The generator yields ``(catalog, source_id)`` while
    the last managed session is still open.

    ``load_data`` is not used in the body; presumably it is requested only so
    the data it loads exists before scanning - TODO confirm against the
    fixture definitions.
    """
    catalog, source_id, source_name = load_sample_data
    # NOTE(review): the one-second pauses presumably keep the timestamps of
    # successive runs distinct - confirm.
    pause_seconds = 1

    # Step 1: catalog scan restricted to the "sample" table pattern.
    with catalog.managed_session:
        scan_sources(catalog, [source_name], include_table_regex=["sample"])
    time.sleep(pause_seconds)

    # Step 2: database scan with the same restriction. The source object
    # fetched here is reused by the last step, in a later session.
    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        scan_database(catalog=catalog, source=source, include_table_regex=["sample"])
    time.sleep(pause_seconds)

    # Step 3: catalog scan without any include/exclude patterns.
    with catalog.managed_session:
        scan_sources(catalog, [source_name])
    time.sleep(pause_seconds)

    # Step 4: database scan restricted to "partial.*", then hand over control
    # with the session still active.
    with catalog.managed_session:
        scan_database(catalog=catalog, source=source, include_table_regex=["partial.*"])
        yield catalog, source_id
Example #5
0
File: cli.py Project: tokern/dbcat
def scan(
    source_name: Optional[List[str]] = typer.Option(
        None, help="List of names of database and data warehouses"
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
):
    # Open the catalog described by dbcat.settings, initialise it, and scan
    # the requested sources with the given include/exclude patterns.
    #
    # NOTE(review): documented with comments on purpose. If this function is
    # registered as a Typer command, a docstring would presumably be surfaced
    # as the command's help text - confirm before converting to a docstring.
    settings = dbcat.settings
    connection_args = {
        "app_dir": settings.APP_DIR,
        "secret": settings.CATALOG_SECRET,
        "path": settings.CATALOG_PATH,
        "host": settings.CATALOG_HOST,
        "port": settings.CATALOG_PORT,
        "user": settings.CATALOG_USER,
        "password": settings.CATALOG_PASSWORD,
        "database": settings.CATALOG_DB,
    }

    # closing() hands back the catalog it wraps and calls catalog.close()
    # when the block exits, whether or not the scan raised.
    with closing(open_catalog(**connection_args)) as catalog:
        init_db(catalog)
        try:
            scan_sources(
                catalog=catalog,
                source_names=source_name,
                include_schema_regex=include_schema,
                exclude_schema_regex=exclude_schema,
                include_table_regex=include_table,
                exclude_table_regex=exclude_table,
            )
        except NoMatchesError:
            # Report the empty result to the user instead of propagating.
            no_match_notice = (
                "No schema or tables scanned. Ensure include/exclude patterns are correct "
                "and database has tables"
            )
            typer.echo(no_match_notice)
Example #6
0
def scan_database(
    catalog: Catalog,
    source: CatSource,
    scan_type: ScanTypeEnum = ScanTypeEnum.metadata,
    incremental: bool = True,
    output_format: OutputFormat = OutputFormat.tabular,
    list_all: bool = False,
    include_schema_regex: Optional[List[str]] = None,
    exclude_schema_regex: Optional[List[str]] = None,
    include_table_regex: Optional[List[str]] = None,
    exclude_table_regex: Optional[List[str]] = None,
    sample_size: int = SMALL_TABLE_MAX,
) -> Union[List[Any], Dict[Any, Any]]:
    """Scan one source with the registered detectors and report the outcome.

    The source is first (re)scanned into the catalog with ``scan_sources``.
    Then either ``metadata_scan`` (when ``scan_type`` is
    ``ScanTypeEnum.metadata``) or ``data_scan`` (any other scan type) is run
    with every registered detector of the matching base class. A task named
    ``piicatcher.<source name>`` is always recorded afterwards with exit code
    0 on success or 1 on failure.

    Args:
        catalog: Catalog that stores the scan results and the task history.
        source: Source to scan; only ``source.name`` is read directly here.
        scan_type: Selects the metadata scan or the data scan.
        incremental: When true, the ``updated_at`` of the latest
            ``piicatcher.<source name>`` task (if any) is passed as
            ``last_run`` to the generators and the output functions.
        output_format: ``OutputFormat.tabular`` returns ``output_tabular``'s
            result; anything else returns ``output_dict``'s result.
        list_all: Forwarded unchanged to the output function.
        include_schema_regex: Schema include patterns, or ``None``.
        exclude_schema_regex: Schema exclude patterns, or ``None``.
        include_table_regex: Table include patterns, or ``None``.
        exclude_table_regex: Table exclude patterns, or ``None``.
        sample_size: Forwarded to ``data_generator`` and ``data_scan``; not
            used by the metadata scan.

    Returns:
        Whatever ``output_tabular`` or ``output_dict`` returns for the source.

    Raises:
        Exception: Anything raised inside the guarded section is re-raised
            after the failed task has been recorded. An error raised by the
            initial ``scan_sources`` call propagates without a task record.
    """

    def _describe(patterns: Optional[List[str]]) -> str:
        # Render a pattern list for the task message; None is spelled "None".
        return ",".join(patterns) if patterns is not None else "None"

    # Built from adjacent literals: the previous backslash continuation inside
    # the string embedded the next line's indentation in the message, and the
    # last field was mislabelled "exclude_schema".
    message = (
        "Source: {source_name}, scan_type: {scan_type}, "
        "include_schema: {include_schema}, exclude_schema: {exclude_schema}, "
        "include_table: {include_table}, exclude_table: {exclude_table}"
    ).format(
        source_name=source.name,
        scan_type=str(scan_type),
        include_schema=_describe(include_schema_regex),
        exclude_schema=_describe(exclude_schema_regex),
        include_table=_describe(include_table_regex),
        exclude_table=_describe(exclude_table_regex),
    )

    # The same four filters are forwarded under two naming schemes.
    scan_filters = {
        "include_schema_regex": include_schema_regex,
        "exclude_schema_regex": exclude_schema_regex,
        "include_table_regex": include_table_regex,
        "exclude_table_regex": exclude_table_regex,
    }
    generator_filters = {
        "exclude_schema_regex_str": exclude_schema_regex,
        "include_schema_regex_str": include_schema_regex,
        "exclude_table_regex_str": exclude_table_regex,
        "include_table_regex_str": include_table_regex,
    }

    status_message = "Success"
    exit_code = 0
    task_name = "piicatcher.{}".format(source.name)

    with catalog.managed_session:
        # NOTE(review): this scan is repeated inside the try block below. It
        # looks redundant, but it is kept because removing it would change
        # which failures are recorded as tasks - confirm before dropping it.
        scan_sources(catalog=catalog, source_names=[source.name], **scan_filters)

        last_run: Optional[datetime.datetime] = None
        if incremental:
            last_task = catalog.get_latest_task(task_name)
            last_run = last_task.updated_at if last_task is not None else None
            if last_run is not None:
                # Format eagerly: passing last_run as a logging argument to a
                # "{}" template is not interpolated by %-style loggers.
                LOGGER.debug("Last Run at {}".format(last_run))
            else:
                LOGGER.debug("No last run found")

        try:
            scan_sources(catalog=catalog, source_names=[source.name], **scan_filters)

            registered_detectors = detectors.detector_registry.get_all().values()

            if scan_type == ScanTypeEnum.metadata:
                detector_list = [
                    detector()
                    for detector in registered_detectors
                    if issubclass(detector, MetadataDetector)
                ]

                # Two separate generator objects are required: one per argument.
                metadata_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        **generator_filters,
                    ),
                    generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        **generator_filters,
                    ),
                )
            else:
                detector_list = [
                    detector()
                    for detector in registered_detectors
                    if issubclass(detector, DatumDetector)
                ]

                data_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        **generator_filters,
                    ),
                    generator=data_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        sample_size=sample_size,
                        **generator_filters,
                    ),
                    sample_size=sample_size,
                )

            if output_format == OutputFormat.tabular:
                return output_tabular(
                    catalog=catalog, source=source, list_all=list_all, last_run=last_run
                )
            return output_dict(
                catalog=catalog, source=source, list_all=list_all, last_run=last_run
            )
        except Exception as e:
            # Remember the failure for the task record, then let it propagate
            # with its original traceback.
            status_message = str(e)
            exit_code = 1
            raise
        finally:
            # Runs on success and on failure, so every guarded run is recorded.
            catalog.add_task(
                task_name,
                exit_code,
                "{}.{}".format(message, status_message),
            )
Example #7
0
def test_pull(setup_catalog_and_data, source):
    """Scan a single source by name and run the shared assertions on it."""
    catalog = setup_catalog_and_data
    source_names = [source]
    scan_sources(catalog, source_names)
    run_asserts(catalog, source)
Example #8
0
def test_pull_all(setup_catalog_and_data):
    """Scan all three sources in one call, then check each of them in turn."""
    catalog = setup_catalog_and_data
    all_sources = ("pg", "mysql", "sqlite_db")

    # scan_sources receives its own fresh list, as before.
    scan_sources(catalog, list(all_sources))

    for source_name in all_sources:
        run_asserts(catalog, source_name)
Example #9
0
def load_data_and_pull(
    load_data,
) -> Generator[Tuple[Catalog, str, int], None, None]:
    """Scan the loaded source into the catalog, then yield the loaded context.

    ``load_data`` is unpacked as ``(catalog, conf, source_id, name)``; the
    source called ``name`` is scanned and ``(catalog, conf, source_id)`` is
    yielded.
    """
    catalog, conf, source_id, source_name = load_data
    scan_sources(catalog, [source_name])
    yield catalog, conf, source_id