def metadata_scan(
    catalog: Catalog,
    detectors: List[MetadataDetector],
    work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
    generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
) -> None:
    """Label columns by running metadata detectors against each column.

    Every column yielded by ``generator`` is offered to the detectors in
    order. The first detector whose ``detect(column)`` returns a non-``None``
    value wins: that value is stored through
    ``catalog.set_column_pii_type`` together with the detector's ``name``,
    and the remaining detectors are skipped for that column.

    Args:
        catalog: Receives the detected type via ``set_column_pii_type``.
        detectors: Detectors to try, in priority order.
        work_generator: Iterable of ``(schema, table, column)`` tuples. It is
            fully consumed up front, solely to obtain the progress-bar total.
        generator: Iterable of ``(schema, table, column)`` tuples that is
            actually scanned.
    """
    # Count the work items without materialising them into a list; the
    # values themselves are never used, only how many there are.
    total_columns = sum(1 for _ in work_generator)

    scanned = 0
    labeled = 0

    for _schema, _table, column in tqdm(
        generator, total=total_columns, desc="columns", unit="columns"
    ):
        scanned += 1
        LOGGER.debug("Scanning column name %s", column.fqdn)

        for detector in detectors:
            # Named pii_type rather than `type` so the builtin is not shadowed.
            pii_type = detector.detect(column)
            if pii_type is not None:
                labeled += 1
                catalog.set_column_pii_type(
                    column=column, pii_type=pii_type, pii_plugin=detector.name
                )
                # First match wins; do not let later detectors overwrite it.
                break

    LOGGER.info("Columns Scanned: %d, Columns Labeled: %d", scanned, labeled)
def data_scan(
    catalog: Catalog,
    detectors: List[DatumDetector],
    work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
    generator: Generator[Tuple[CatSchema, CatTable, CatColumn, str], None, None],
    sample_size: int = SMALL_TABLE_MAX,
) -> None:
    """Label columns by running datum detectors against sampled values.

    Every ``(schema, table, column, val)`` tuple yielded by ``generator`` is
    examined. ``None`` values are skipped. Otherwise the detectors are tried
    in order and the first one whose ``detect(column=..., datum=...)``
    returns a non-``None`` value wins: the result is stored through
    ``catalog.set_column_pii_type`` with the detector's ``name`` and is
    reported to ``scan_logger`` and ``data_logger``.

    Args:
        catalog: Receives the detected type via ``set_column_pii_type``.
        detectors: Detectors to try, in priority order.
        work_generator: Iterable of ``(schema, table, column)`` tuples. It is
            fully consumed up front to estimate the progress-bar total.
        generator: Iterable of ``(schema, table, column, val)`` tuples that
            is actually scanned.
        sample_size: Multiplied by the number of columns kept by
            ``_filter_text_columns`` to estimate the progress-bar total.
    """
    text_columns = _filter_text_columns(
        [column for _schema, _table, column in work_generator]
    )
    # Progress-bar estimate only: the loop below runs for however many
    # tuples `generator` actually yields.
    total_work = len(text_columns) * sample_size

    scanned = 0
    labeled = 0

    for _schema, _table, column, val in tqdm(
        generator, total=total_work, desc="datum", unit="datum"
    ):
        scanned += 1
        LOGGER.debug("Scanning column name %s", column.fqdn)

        if val is None:
            # Nothing to inspect for this sample.
            continue

        for detector in detectors:
            # Named pii_type rather than `type` so the builtin is not shadowed.
            pii_type = detector.detect(column=column, datum=val)
            if pii_type is None:
                continue

            labeled += 1
            catalog.set_column_pii_type(
                column=column, pii_type=pii_type, pii_plugin=detector.name
            )
            # Lazy %-style arguments: the message is only rendered when debug
            # logging is enabled, and the emitted text is unchanged.
            LOGGER.debug("%s has %s", column.fqdn, pii_type)

            scan_logger.info(
                "deep_scan", extra={"column": column.fqdn, "pii_types": pii_type}
            )

            # This record carries the raw sampled value in `data`.
            data_logger.info(
                "deep_scan",
                extra={"column": column.fqdn, "data": val, "pii_types": pii_type},
            )
            # First match wins for this value.
            break

    # NOTE(review): both counters advance once per yielded value, so these
    # figures are datums scanned / labeling events rather than distinct
    # columns. The message text is kept as-is for log consumers; confirm
    # whether it should be reworded.
    LOGGER.info("Columns Scanned: %d, Columns Labeled: %d", scanned, labeled)