Example #1
# Shallow scan (the default): classify columns by name only, then verify the detected PII types.
def test_scan_database_shallow(load_sample_data_and_pull):
    catalog, source_id = load_sample_data_and_pull
    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        scan_database(catalog=catalog,
                      source=source,
                      include_table_regex=["sample"])

        schemata = catalog.search_schema(source_like=source.name,
                                         schema_like="%")

        for column_name, pii_type in [
            ("address", piicatcher.Address()),
            ("city", piicatcher.Address()),
            ("email", piicatcher.Email()),
            ("fname", piicatcher.Person()),
            ("gender", piicatcher.Gender()),
            ("lname", piicatcher.Person()),
            ("maiden_name", piicatcher.Person()),
            ("state", piicatcher.Address()),
        ]:
            column = catalog.get_column(
                source_name=source.name,
                schema_name=schemata[0].name,
                table_name="sample",
                column_name=column_name,
            )
            assert column.pii_type == pii_type

        latest_task = catalog.get_latest_task("piicatcher.{}".format(
            source.name))
        assert latest_task.status == 0
        assert latest_task.created_at is not None
        assert latest_task.updated_at is not None
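The pytest fixtures referenced throughout (load_sample_data_and_pull, load_data_and_pull, and so on) are not part of this listing. As a rough orientation only, a minimal fixture along these lines could provide the (catalog, source_id) pair. The dbcat helpers match those used in Example #7 below, but the source-registration details (add_source arguments, paths, the fixture body as a whole) are assumptions, not piicatcher's actual test setup:

import pytest
from dbcat.api import init_db, open_catalog, scan_sources


@pytest.fixture
def load_sample_data_and_pull(tmp_path):
    # Hypothetical catalog backed by a local SQLite file.
    catalog = open_catalog(
        app_dir=tmp_path,
        secret="changeme",
        path=str(tmp_path / "catalog.db"),
    )
    init_db(catalog)
    with catalog.managed_session:
        # Register the database under test; argument names here are illustrative.
        source = catalog.add_source(
            name="sample_db",
            source_type="sqlite",
            uri=str(tmp_path / "sample.db"),
        )
        source_id = source.id
    # Pull table and column metadata into the catalog (the "pull" in the fixture name).
    with catalog.managed_session:
        scan_sources(catalog, ["sample_db"])
    yield catalog, source_id
    catalog.close()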
Example #2
# Full (non-incremental) rescan: every matching column is re-inspected regardless of timestamps.
def test_full_scan(setup_incremental):
    catalog, source_id = setup_incremental

    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        time.sleep(1)  # ensure this scan's task gets a strictly later timestamp
        scan_database(catalog=catalog, source=source, incremental=False)
        # There should be 3 tasks recorded for this source.
        tasks = catalog.get_tasks_by_app_name("piicatcher.{}".format(source.name))
        assert len(tasks) == 3

        schemata = catalog.search_schema(source_like=source.name, schema_like="%")

        updated_cols = 0
        for table_name in [
            "no_pii",
            "full_pii",
            "partial_pii",
            "partial_data_type",
            "sample",
        ]:
            table = catalog.get_table(
                source_name=source.name,
                schema_name=schemata[0].name,
                table_name=table_name,
            )
            updated_cols += len(
                list(
                    catalog.get_columns_for_table(table, newer_than=tasks[1].updated_at)
                )
            )

        assert updated_cols == 11
Example #3
# Tabular output with list_all=True: every column is listed, not only those flagged as PII.
def test_output_tabular_all(load_data_and_pull):
    catalog, source_id = load_data_and_pull
    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        scan_database(catalog=catalog, source=source)

        result = output_tabular(catalog=catalog, source=source, list_all=True)
        if source.source_type == "mysql":
            assert result == mysql_output_tabular_all
        elif source.source_type == "postgresql":
            assert result == pg_output_tabular_all
        elif source.source_type == "sqlite":
            assert result == sqlite_output_tabular_all
Example #4
# Default tabular output (list_all=False): only columns with detected PII are listed.
def test_output_tabular(load_data_and_pull):
    catalog, source_id = load_data_and_pull
    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        scan_database(catalog=catalog,
                      source=source,
                      exclude_table_regex=["partial_data_type"])

        result = output_tabular(catalog=catalog, source=source, list_all=False)
        if source.source_type == "mysql":
            assert result == mysql_output_tabular
        elif source.source_type == "postgresql":
            assert result == pg_output_tabular
        elif source.source_type == "sqlite":
            assert result == sqlite_output_tabular
Example #5
# Fixture: runs a series of scans with 1-second pauses so each recorded task gets a distinct timestamp.
def setup_incremental(
    load_sample_data, load_data
) -> Generator[Tuple[Catalog, int], None, None]:
    catalog, source_id, name = load_sample_data
    with catalog.managed_session:
        scan_sources(catalog, [name], include_table_regex=["sample"])
    time.sleep(1)
    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        scan_database(catalog=catalog, source=source, include_table_regex=["sample"])
    time.sleep(1)
    with catalog.managed_session:
        scan_sources(catalog, [name])
    time.sleep(1)
    with catalog.managed_session:
        scan_database(catalog=catalog, source=source, include_table_regex=["partial.*"])
        yield catalog, source_id
Example #6
# Deep scan (scan_type=data): sample and inspect actual column values, not just column names.
def test_scan_database_deep(load_sample_data_and_pull):
    catalog, source_id = load_sample_data_and_pull
    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        scan_database(
            catalog=catalog,
            source=source,
            include_table_regex=["sample"],
            scan_type=ScanTypeEnum.data,
        )

        schemata = catalog.search_schema(source_like=source.name,
                                         schema_like="%")

        for column_name, pii_type in [("id", piicatcher.BirthDate())]:
            column = catalog.get_column(
                source_name=source.name,
                schema_name=schemata[0].name,
                table_name="sample",
                column_name=column_name,
            )
            assert column.pii_type == pii_type
Example #7
# Typer CLI command: exposes scan_database's options on the command line.
def detect(
    source_name: str = typer.Option(..., help="Name of database to scan."),
    scan_type: ScanTypeEnum = typer.Option(
        ScanTypeEnum.metadata,
        help="Choose deep (scan data) or shallow (scan column names only)",
    ),
    incremental: bool = typer.Option(
        True,
        help="Scan columns updated or created since last run",
    ),
    list_all: bool = typer.Option(
        False,
        help="List all columns. By default only columns with PII information are listed",
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(None, help=exclude_schema_help_text),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(None, help=exclude_table_help_text),
    sample_size: int = typer.Option(
        SMALL_TABLE_MAX,
        help="Sample size for large tables when running deep scan.",
    ),
):
    catalog = open_catalog(
        app_dir=dbcat.settings.APP_DIR,
        secret=dbcat.settings.CATALOG_SECRET,
        path=dbcat.settings.CATALOG_PATH,
        host=dbcat.settings.CATALOG_HOST,
        port=dbcat.settings.CATALOG_PORT,
        user=dbcat.settings.CATALOG_USER,
        password=dbcat.settings.CATALOG_PASSWORD,
        database=dbcat.settings.CATALOG_DB,
    )

    with closing(catalog) as catalog:
        init_db(catalog)
        with catalog.managed_session:
            source = catalog.get_source(source_name)
            try:
                op = scan_database(
                    catalog=catalog,
                    source=source,
                    scan_type=scan_type,
                    incremental=incremental,
                    output_format=dbcat.settings.OUTPUT_FORMAT,
                    list_all=list_all,
                    include_schema_regex=include_schema,
                    exclude_schema_regex=exclude_schema,
                    include_table_regex=include_table,
                    exclude_table_regex=exclude_table,
                    sample_size=sample_size,
                )
                typer.echo(message=str_output(op, dbcat.settings.OUTPUT_FORMAT))
            except NoMatchesError:
                typer.echo(message=NoMatchesError.message)
                # typer.Exit is an exception and must be raised to set the exit code.
                raise typer.Exit(1)
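Because detect is a Typer command, each parameter above surfaces as a command-line option: underscores become dashes, and the incremental boolean gets an --incremental/--no-incremental pair. Assuming the command is mounted as detect on the piicatcher CLI and that the ScanTypeEnum values are metadata and data (as Example #6 suggests), a deep, non-incremental scan of selected tables might be invoked like this, where my_db is a placeholder source name:

piicatcher detect --source-name my_db \
    --scan-type data \
    --no-incremental \
    --include-table "partial.*" \
    --sample-size 1000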