Example #1
0
def metadata_scan(
    catalog: Catalog,
    detectors: List[MetadataDetector],
    work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None,
                              None],
    generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
) -> None:
    """Label columns in *catalog* by running metadata detectors on each one.

    Every column yielded by *generator* is passed to the detectors in order.
    The first detector that returns a non-``None`` PII type wins: the type is
    stored through ``catalog.set_column_pii_type`` with the detector's
    ``name`` as the plugin, and the remaining detectors are skipped for that
    column.

    Args:
        catalog: Catalog on which detected PII types are recorded.
        detectors: Detectors tried in order for each column.
        work_generator: Iterated to exhaustion up front; only the number of
            items is used, as the progress-bar total.
        generator: Yields the ``(schema, table, column)`` tuples to scan.
    """
    # Count the work items without materialising them in a throwaway list.
    total_columns = sum(1 for _ in work_generator)

    counter = 0
    set_number = 0
    for _schema, _table, column in tqdm(generator,
                                        total=total_columns,
                                        desc="columns",
                                        unit="columns"):
        counter += 1
        LOGGER.debug("Scanning column name %s", column.fqdn)
        for detector in detectors:
            # Named pii_type rather than `type` to avoid shadowing the builtin.
            pii_type = detector.detect(column)
            if pii_type is not None:
                set_number += 1
                catalog.set_column_pii_type(column=column,
                                            pii_type=pii_type,
                                            pii_plugin=detector.name)
                # First match wins; do not consult the remaining detectors.
                break

    LOGGER.info("Columns Scanned: %d, Columns Labeled: %d", counter,
                set_number)
Example #2
0
def data_scan(
    catalog: Catalog,
    detectors: List[DatumDetector],
    work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None,
                              None],
    generator: Generator[Tuple[CatSchema, CatTable, CatColumn, str], None,
                         None],
    sample_size: int = SMALL_TABLE_MAX,
) -> None:
    """Label columns in *catalog* by running datum detectors on sampled values.

    Every ``(schema, table, column, value)`` tuple yielded by *generator* with
    a non-``None`` value is passed to the detectors in order. The first
    detector that returns a non-``None`` PII type wins: the type is stored
    through ``catalog.set_column_pii_type`` with the detector's ``name`` as
    the plugin, a ``"deep_scan"`` record is emitted on ``scan_logger`` and
    ``data_logger``, and the remaining detectors are skipped for that value.

    Args:
        catalog: Catalog on which detected PII types are recorded.
        detectors: Detectors tried in order for each value.
        work_generator: Iterated to exhaustion up front; its columns are
            passed through ``_filter_text_columns`` and only their count is
            used to estimate the progress-bar total.
        generator: Yields the ``(schema, table, column, value)`` tuples to
            scan.
        sample_size: Multiplied by the filtered column count to get the
            progress-bar total.
    """
    total_columns = _filter_text_columns(
        [column for _, _, column in work_generator])
    total_work = len(total_columns) * sample_size

    # NOTE: both counters advance once per datum, not once per distinct
    # column, even though the summary log line below says "Columns".
    counter = 0
    set_number = 0

    for _schema, _table, column, val in tqdm(generator,
                                             total=total_work,
                                             desc="datum",
                                             unit="datum"):
        counter += 1
        LOGGER.debug("Scanning column name %s", column.fqdn)
        if val is None:
            # Nothing to inspect for this sample.
            continue

        for detector in detectors:
            # Named pii_type rather than `type` to avoid shadowing the builtin.
            pii_type = detector.detect(column=column, datum=val)
            if pii_type is None:
                continue

            set_number += 1
            catalog.set_column_pii_type(column=column,
                                        pii_type=pii_type,
                                        pii_plugin=detector.name)
            # Lazy %-args: the message is only built if DEBUG is enabled.
            LOGGER.debug("%s has %s", column.fqdn, pii_type)

            scan_logger.info("deep_scan",
                             extra={
                                 "column": column.fqdn,
                                 "pii_types": pii_type
                             })
            # NOTE(review): this record carries the raw datum, which may
            # itself be PII - confirm data_logger's sink is protected.
            data_logger.info(
                "deep_scan",
                extra={
                    "column": column.fqdn,
                    "data": val,
                    "pii_types": pii_type
                },
            )
            # First match wins; do not consult the remaining detectors.
            break

    LOGGER.info("Columns Scanned: %d, Columns Labeled: %d", counter,
                set_number)