Example #1
0
def test_data_generator(sqlalchemy_engine):
    """Iterating ``data_generator`` with no filters must yield 14 items."""
    # The fixture yields a 3-tuple; the connection is not needed here.
    catalog, source, _ = sqlalchemy_engine

    generated = data_generator(catalog=catalog, source=source)
    total = sum(1 for _ in generated)

    assert total == 14
Example #2
0
def test_data_generator_include_int_table(load_source):
    """Restricting to tables matching ``partial_data_type`` must yield 2 items."""
    catalog, source = load_source

    generated = data_generator(
        catalog=catalog,
        source=source,
        include_table_regex_str=["partial_data_type"],
    )
    total = sum(1 for _ in generated)

    assert total == 2
Example #3
0
def test_data_generator_exclude_table(load_source):
    """Excluding tables matching ``full.*`` must leave 10 generated items."""
    catalog, source = load_source

    generated = data_generator(
        catalog=catalog,
        source=source,
        exclude_table_regex_str=["full.*"],
    )
    total = sum(1 for _ in generated)

    assert total == 10
Example #4
0
def test_data_generator_exclude_schema(load_source):
    """Excluding the first schema found for the source must yield nothing."""
    catalog, source = load_source

    # Look up every schema of this source and exclude the first one by name.
    schemata = catalog.search_schema(source_like=source.name, schema_like="%")
    excluded_schema = schemata[0].name

    generated = data_generator(
        catalog=catalog,
        source=source,
        exclude_schema_regex_str=[excluded_schema],
    )
    total = sum(1 for _ in generated)

    assert total == 0
Example #5
0
def test_incremental_data_generator(setup_incremental):
    """Compare a full run against a run limited by ``last_run``.

    Without ``last_run`` the generator must yield 434 items; when
    ``last_run`` is the ``updated_at`` of the first recorded task it must
    yield 14.
    """
    catalog, source_id = setup_incremental

    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        app_name = "piicatcher.{}".format(source.name)
        tasks = catalog.get_tasks_by_app_name(app_name)

        full_total = sum(
            1 for _ in data_generator(catalog=catalog, source=source)
        )
        assert full_total == 434

        incremental_total = sum(
            1
            for _ in data_generator(
                catalog=catalog, source=source, last_run=tasks[0].updated_at
            )
        )
        assert incremental_total == 14
Example #6
0
def test_deep_scan(load_data_and_pull):
    """Run ``data_scan`` with the regex detector and check one column's PII type.

    After the scan, column ``a`` of table ``partial_pii`` (in the first
    schema found for the source) must have ``pii_type`` equal to ``Phone()``.
    """
    catalog, source_id = load_data_and_pull

    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)

        # Built in the same order the original keyword arguments evaluated.
        regex_detectors = [DatumRegexDetector()]
        columns = column_generator(catalog=catalog, source=source)
        data = data_generator(catalog=catalog, source=source)
        data_scan(
            catalog=catalog,
            detectors=regex_detectors,
            work_generator=columns,
            generator=data,
        )

        schemata = catalog.search_schema(
            source_like=source.name, schema_like="%"
        )
        first_schema = schemata[0].name
        scanned_column = catalog.get_column(
            source_name=source.name,
            schema_name=first_schema,
            table_name="partial_pii",
            column_name="a",
        )
        assert scanned_column.pii_type == Phone()
Example #7
0
def scan_database(
    catalog: Catalog,
    source: CatSource,
    scan_type: ScanTypeEnum = ScanTypeEnum.metadata,
    incremental: bool = True,
    output_format: OutputFormat = OutputFormat.tabular,
    list_all: bool = False,
    include_schema_regex: Optional[List[str]] = None,
    exclude_schema_regex: Optional[List[str]] = None,
    include_table_regex: Optional[List[str]] = None,
    exclude_table_regex: Optional[List[str]] = None,
    sample_size: int = SMALL_TABLE_MAX,
) -> Union[List[Any], Dict[Any, Any]]:
    """Scan ``source`` with the registered detectors and return the report.

    The outcome of the run (exit code plus a summary message) is always
    recorded through ``catalog.add_task`` under the application name
    ``piicatcher.<source.name>``, whether the scan succeeds or fails.

    Args:
        catalog: Catalog used for the session, task log and scan results.
        source: Source to scan.
        scan_type: ``ScanTypeEnum.metadata`` runs ``metadata_scan`` with all
            registered ``MetadataDetector`` subclasses; any other value runs
            ``data_scan`` with all registered ``DatumDetector`` subclasses.
        incremental: When true, the ``updated_at`` of the latest recorded
            task (if any) is passed as ``last_run`` to the generators and
            the output functions.
        output_format: ``OutputFormat.tabular`` returns ``output_tabular``;
            anything else returns ``output_dict``.
        list_all: Forwarded to the output function.
        include_schema_regex: Schema include patterns, or ``None``.
        exclude_schema_regex: Schema exclude patterns, or ``None``.
        include_table_regex: Table include patterns, or ``None``.
        exclude_table_regex: Table exclude patterns, or ``None``.
        sample_size: Forwarded to ``data_generator`` and ``data_scan``.

    Returns:
        The value produced by ``output_tabular`` or ``output_dict``.

    Raises:
        Exception: Any error raised during the scan is re-raised after the
            failure has been recorded as a task with exit code 1.
    """

    def _describe(patterns: Optional[List[str]]) -> str:
        # Render a pattern list for the task message; None is spelled out.
        return ",".join(patterns) if patterns is not None else "None"

    # Implicit string concatenation keeps source indentation out of the
    # message, and the last label names the table (not schema) exclusions.
    message = (
        "Source: {source_name}, scan_type: {scan_type}, "
        "include_schema: {include_schema}, "
        "exclude_schema: {exclude_schema}, "
        "include_table: {include_table}, "
        "exclude_table: {exclude_table}"
    ).format(
        source_name=source.name,
        scan_type=str(scan_type),
        include_schema=_describe(include_schema_regex),
        exclude_schema=_describe(exclude_schema_regex),
        include_table=_describe(include_table_regex),
        exclude_table=_describe(exclude_table_regex),
    )

    status_message = "Success"
    exit_code = 0

    # Filter arguments shared by every generator built below.
    generator_filters = {
        "exclude_schema_regex_str": exclude_schema_regex,
        "include_schema_regex_str": include_schema_regex,
        "exclude_table_regex_str": exclude_table_regex,
        "include_table_regex_str": include_table_regex,
    }

    with catalog.managed_session:
        last_run: Optional[datetime.datetime] = None
        if incremental:
            last_task = catalog.get_latest_task(
                "piicatcher.{}".format(source.name)
            )
            last_run = last_task.updated_at if last_task is not None else None
            if last_run is not None:
                # Format eagerly: a "{}" placeholder combined with lazy
                # logging arguments is not interpolated by stdlib logging.
                # NOTE(review): LOGGER's definition is not visible here.
                LOGGER.debug("Last Run at {}".format(last_run))
            else:
                LOGGER.debug("No last run found")

        try:
            # scan_sources is invoked once, inside the try block, so that a
            # failure here is still recorded by add_task in ``finally``.
            scan_sources(
                catalog=catalog,
                source_names=[source.name],
                include_schema_regex=include_schema_regex,
                exclude_schema_regex=exclude_schema_regex,
                include_table_regex=include_table_regex,
                exclude_table_regex=exclude_table_regex,
            )

            registered = detectors.detector_registry.get_all().values()

            if scan_type == ScanTypeEnum.metadata:
                detector_list = [
                    detector()
                    for detector in registered
                    if issubclass(detector, MetadataDetector)
                ]

                # Two separate generator objects are created on purpose:
                # each one can only be consumed once.
                metadata_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        **generator_filters,
                    ),
                    generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        **generator_filters,
                    ),
                )
            else:
                detector_list = [
                    detector()
                    for detector in registered
                    if issubclass(detector, DatumDetector)
                ]

                data_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        **generator_filters,
                    ),
                    generator=data_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        sample_size=sample_size,
                        **generator_filters,
                    ),
                    sample_size=sample_size,
                )

            if output_format == OutputFormat.tabular:
                return output_tabular(
                    catalog=catalog,
                    source=source,
                    list_all=list_all,
                    last_run=last_run,
                )
            return output_dict(
                catalog=catalog,
                source=source,
                list_all=list_all,
                last_run=last_run,
            )
        except Exception as e:
            # Capture the failure for the task record, then propagate it
            # unchanged to the caller.
            status_message = str(e)
            exit_code = 1
            raise
        finally:
            # Runs on success and failure alike, including the return paths.
            catalog.add_task(
                "piicatcher.{}".format(source.name),
                exit_code,
                "{}.{}".format(message, status_message),
            )