def metadata_scan(
    catalog: Catalog,
    detectors: List[MetadataDetector],
    work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
    generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
):
    # The work generator is consumed only to count columns for the progress bar.
    total_columns = len([c for s, t, c in work_generator])
    counter = 0
    set_number = 0
    for schema, table, column in tqdm(
        generator, total=total_columns, desc="columns", unit="columns"
    ):
        counter += 1
        LOGGER.debug("Scanning column name %s", column.fqdn)
        for detector in detectors:
            # Stop at the first detector that labels the column.
            type = detector.detect(column)
            if type is not None:
                set_number += 1
                catalog.set_column_pii_type(
                    column=column, pii_type=type, pii_plugin=detector.name
                )
                break

    LOGGER.info("Columns Scanned: %d, Columns Labeled: %d", counter, set_number)
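# Illustrative sketch, not part of the original code: metadata_scan only requires each
# detector to expose a ``name`` attribute and a ``detect(column)`` method that returns a
# PII type accepted by catalog.set_column_pii_type, or None. The class below is a
# hypothetical example of that contract; the class name, the regex, and the "PHONE"
# stand-in label are assumptions, and a real detector would return the project's PII
# type objects and register itself with the detector registry.
import re


class ExampleColumnNameDetector(MetadataDetector):
    """Hypothetical metadata detector that labels columns whose names suggest phone numbers."""

    name = "example_column_name_detector"

    def detect(self, column: CatColumn):
        # Return a label for matching column names; returning None lets
        # metadata_scan fall through to the next detector.
        if re.search(r"phone", column.name, re.IGNORECASE):
            return "PHONE"  # stand-in for the project's PII type object (assumption)
        return None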
def extract_lineage(
    catalog: Catalog,
    visited_query: DmlVisitor,
    source: CatSource,
    parsed: Parsed,
    start_time,
    end_time,
) -> JobExecution:
    job = catalog.add_job(
        name=parsed.name, source=source, context={"query": parsed.query}
    )
    job_execution = catalog.add_job_execution(
        job=job,
        started_at=start_time,
        ended_at=end_time,
        status=JobExecutionStatus.SUCCESS,
    )
    # Each entry in visited_query.source_columns groups the columns that feed the
    # corresponding target column.
    for source_group, target in zip(
        visited_query.source_columns, visited_query.target_columns
    ):
        for column in source_group.columns:
            edge = catalog.add_column_lineage(column, target, job_execution.id, {})
            logging.debug("Added {}".format(edge))

    return job_execution
def _bind_target(self, catalog: Catalog, source: CatSource):
    target_table_visitor = RangeVarVisitor()
    target_table_visitor(self._insert_table)
    if target_table_visitor.is_qualified:
        schema = catalog.get_schema(
            source_name=source.name, schema_name=target_table_visitor.schema_name
        )
    elif source.default_schema is not None:
        schema = source.default_schema.schema
    else:
        raise SemanticError(
            "No default schema set for source {}".format(source.fqdn)
        )
    self._target_table = catalog.add_table(
        table_name=target_table_visitor.name, schema=schema
    )
    sort_order = 1
    for col in self._insert_columns:
        self._target_columns.append(
            catalog.add_column(
                column_name=col,
                data_type="varchar",
                sort_order=sort_order,
                table=self._target_table,
            )
        )
        sort_order += 1
def scan_sources(
    catalog: Catalog,
    source_names: Optional[List[str]] = None,
    include_schema_regex: Optional[List[str]] = None,
    exclude_schema_regex: Optional[List[str]] = None,
    include_table_regex: Optional[List[str]] = None,
    exclude_table_regex: Optional[List[str]] = None,
):
    with catalog.managed_session:
        if source_names is not None and len(source_names) > 0:
            sources: List[CatSource] = []
            for source_name in source_names:
                try:
                    sources.append(catalog.get_source(source_name))
                except NoResultFound:
                    LOGGER.error("Source '%s' not found", source_name)
        else:
            sources = catalog.get_sources()

        LOGGER.info("%d sources will be scanned", len(sources))
        for source in sources:
            scanner = DbScanner(
                catalog,
                source,
                include_schema_regex_str=include_schema_regex,
                exclude_schema_regex_str=exclude_schema_regex,
                include_table_regex_str=include_table_regex,
                exclude_table_regex_str=exclude_table_regex,
            )
            LOGGER.info("Scanning %s", scanner.name)
            try:
                scanner.scan()
            except StopIteration:
                raise NoMatchesError
def data_scan(
    catalog: Catalog,
    detectors: List[DatumDetector],
    work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
    generator: Generator[Tuple[CatSchema, CatTable, CatColumn, str], None, None],
    sample_size: int = SMALL_TABLE_MAX,
):
    total_columns = _filter_text_columns([c for s, t, c in work_generator])
    total_work = len(total_columns) * sample_size
    counter = 0
    set_number = 0
    for schema, table, column, val in tqdm(
        generator, total=total_work, desc="datum", unit="datum"
    ):
        counter += 1
        LOGGER.debug("Scanning column name %s", column.fqdn)
        if val is not None:
            for detector in detectors:
                type = detector.detect(column=column, datum=val)
                if type is not None:
                    set_number += 1
                    catalog.set_column_pii_type(
                        column=column, pii_type=type, pii_plugin=detector.name
                    )
                    LOGGER.debug("%s has %s", column.fqdn, type)
                    scan_logger.info(
                        "deep_scan", extra={"column": column.fqdn, "pii_types": type}
                    )
                    data_logger.info(
                        "deep_scan",
                        extra={"column": column.fqdn, "data": val, "pii_types": type},
                    )
                    break

    LOGGER.info("Columns Scanned: %d, Columns Labeled: %d", counter, set_number)
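# Illustrative sketch, not part of the original code: data_scan invokes each detector as
# ``detect(column=..., datum=...)`` for every sampled value and stops at the first
# non-None result. The hypothetical value-based detector below follows that contract;
# the naive email check and the "EMAIL" stand-in label are assumptions, not the
# project's real detectors or PII type objects.
class ExampleEmailDatumDetector(DatumDetector):
    """Hypothetical datum detector that labels a column when a sampled value looks like an email."""

    name = "example_email_datum_detector"

    def detect(self, column: CatColumn, datum: str):
        # A deliberately simple heuristic: exactly one "@" with a dot in the domain part.
        if datum and datum.count("@") == 1 and "." in datum.split("@")[-1]:
            return "EMAIL"  # stand-in for the project's PII type object (assumption)
        return None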
def column_generator(
    catalog: Catalog,
    source: CatSource,
    last_run: Optional[datetime.datetime] = None,
    include_schema_regex_str: Optional[List[str]] = None,
    exclude_schema_regex_str: Optional[List[str]] = None,
    include_table_regex_str: Optional[List[str]] = None,
    exclude_table_regex_str: Optional[List[str]] = None,
) -> Generator[Tuple[CatSchema, CatTable, CatColumn], None, None]:
    try:
        for schema, table in table_generator(
            catalog=catalog,
            source=source,
            include_schema_regex_str=include_schema_regex_str,
            exclude_schema_regex_str=exclude_schema_regex_str,
            include_table_regex_str=include_table_regex_str,
            exclude_table_regex_str=exclude_table_regex_str,
        ):
            for column in catalog.get_columns_for_table(
                table=table, newer_than=last_run
            ):
                LOGGER.debug(f"Scanning {schema.name}.{table.name}.{column.name}")
                yield schema, table, column
    except StopIteration:
        raise NoMatchesError
def create_graph(catalog: Catalog, visited_queries: List[DmlVisitor]) -> DbGraph:
    logger = LogMixin()
    job_ids = set()
    for query in visited_queries:
        job = catalog.add_job(query.name, {})
        job_execution = catalog.add_job_execution(
            job, datetime.now(), datetime.now(), JobExecutionStatus.SUCCESS
        )
        for source, target in zip(query.source_columns, query.target_columns):
            edge = catalog.add_column_lineage(source, target, job_execution.id, {})
            job_ids.add(job.id)
            logger.logger.debug("Added {}".format(edge))

    graph = DbGraph(catalog, job_ids)
    graph.load()
    return graph
def _bind_target(self, catalog: Catalog, source: CatSource):
    target_table_visitor = RangeVarVisitor()
    target_table_visitor(self._insert_table)
    logging.debug("Searching for: {}".format(target_table_visitor.search_string))
    try:
        self._target_table = catalog.search_table(
            source_like=source.name, **target_table_visitor.search_string
        )
    except RuntimeError as error:
        logging.debug(str(error))
        raise TableNotFound(
            '"{schema_like}"."{table_like}" is not found'.format(
                **target_table_visitor.search_string
            )
        )
    logging.debug("Bound target table: {}".format(self._target_table))

    if len(self._insert_columns) == 0:
        self._target_columns = catalog.get_columns_for_table(self._target_table)
        logging.debug("Bound all columns in {}".format(self._target_table))
    else:
        bound_cols = catalog.get_columns_for_table(
            self._target_table, column_names=self._insert_columns
        )
        # Handle error case
        if len(bound_cols) != len(self._insert_columns):
            for column in self._insert_columns:
                found = False
                for bound in bound_cols:
                    if column == bound.name:
                        found = True
                        break
                if not found:
                    raise ColumnNotFound(
                        '"{}" not found in the following tables: {}'.format(
                            column,
                            json.dumps([self._target_table], cls=CatTableEncoder),
                        )
                    )
        self._target_columns = bound_cols
        logging.debug("Bound {} target columns".format(len(bound_cols)))
def runserver(obj, port):
    logger = LogMixin()
    with open(obj, "r") as file:
        config = yaml.load(file, Loader=yaml.FullLoader)
    logger.logger.debug("Loaded config file: {}".format(obj))
    logger.logger.debug(config)

    catalog = Catalog(**config["catalog"])
    server = Server(port, catalog)
    server.run_server()
def table_generator(
    catalog: Catalog,
    source: CatSource,
    include_schema_regex_str: Optional[List[str]] = None,
    exclude_schema_regex_str: Optional[List[str]] = None,
    include_table_regex_str: Optional[List[str]] = None,
    exclude_table_regex_str: Optional[List[str]] = None,
) -> Generator[Tuple[CatSchema, CatTable], None, None]:
    schemata = filter_objects(
        include_schema_regex_str,
        exclude_schema_regex_str,
        [
            CatalogObject(s.name, s.id)
            for s in catalog.search_schema(source_like=source.name, schema_like="%")
        ],
    )
    for schema_object in schemata:
        schema = catalog.get_schema_by_id(schema_object.id)
        LOGGER.info("Generating schema %s", schema.name)
        table_objects = filter_objects(
            include_table_regex_str,
            exclude_table_regex_str,
            [
                CatalogObject(t.name, t.id)
                for t in catalog.search_tables(
                    source_like=source.name, schema_like=schema.name, table_like="%"
                )
            ],
        )
        for table_object in table_objects:
            table = catalog.get_table_by_id(table_object.id)
            LOGGER.info("Generating table %s", table.name)
            yield schema, table
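# Illustrative usage, not part of the original code: table_generator lazily yields
# (schema, table) pairs that survive the include/exclude regex filters, so callers can
# stream over large catalogs. The helper below is a hypothetical caller; the source name
# "my_source" and the "^prod.*" schema filter are placeholder assumptions.
def example_list_prod_tables(catalog: Catalog) -> List[str]:
    with catalog.managed_session:
        source = catalog.get_source("my_source")  # assumes this source is registered
        return [
            "{}.{}".format(schema.name, table.name)
            for schema, table in table_generator(
                catalog=catalog,
                source=source,
                include_schema_regex_str=["^prod.*"],
            )
        ]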
def add_athena_source(
    catalog: Catalog,
    name: str,
    region_name: str,
    s3_staging_dir: str,
    aws_access_key_id: Optional[str] = None,
    aws_secret_access_key: Optional[str] = None,
) -> CatSource:
    with catalog.commit_context:
        return catalog.add_source(
            name=name,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
            s3_staging_dir=s3_staging_dir,
            source_type="athena",
        )
def add_redshift_source(
    catalog: Catalog,
    name: str,
    username: str,
    password: str,
    database: str,
    uri: str,
    port: Optional[int] = None,
) -> CatSource:
    with catalog.commit_context:
        return catalog.add_source(
            name=name,
            username=username,
            password=password,
            database=database,
            uri=uri,
            port=port,
            source_type="redshift",
        )
def add_snowflake_source(
    catalog: Catalog,
    name: str,
    account: str,
    username: str,
    password: str,
    database: str,
    warehouse: str,
    role: str,
) -> CatSource:
    with catalog.commit_context:
        return catalog.add_source(
            name=name,
            username=username,
            password=password,
            database=database,
            account=account,
            warehouse=warehouse,
            role=role,
            source_type="snowflake",
        )
def data_generator(
    catalog: Catalog,
    source: CatSource,
    last_run: Optional[datetime.datetime] = None,
    include_schema_regex_str: Optional[List[str]] = None,
    exclude_schema_regex_str: Optional[List[str]] = None,
    include_table_regex_str: Optional[List[str]] = None,
    exclude_table_regex_str: Optional[List[str]] = None,
    sample_size: int = SMALL_TABLE_MAX,
) -> Generator[Tuple[CatSchema, CatTable, CatColumn, str], None, None]:
    for schema, table in table_generator(
        catalog=catalog,
        source=source,
        include_schema_regex_str=include_schema_regex_str,
        exclude_schema_regex_str=exclude_schema_regex_str,
        include_table_regex_str=include_table_regex_str,
        exclude_table_regex_str=exclude_table_regex_str,
    ):
        try:
            columns = _filter_text_columns(
                catalog.get_columns_for_table(table=table, newer_than=last_run)
            )
            if len(columns) > 0:
                for row in _row_generator(
                    column_list=columns,
                    schema=schema,
                    table=table,
                    source=source,
                    sample_size=sample_size,
                ):
                    for col, val in zip(columns, row):
                        yield schema, table, col, val
        except StopIteration:
            raise NoMatchesError
        except exc.SQLAlchemyError as e:
            LOGGER.warning(
                f"Exception when getting data for {schema.name}.{table.name}. Code: {e.code}"
            )
def add_sqlite_source(
    catalog: Catalog,
    name: str,
    path: Path,
):
    with catalog.managed_session:
        catalog.add_source(name=name, uri=str(path), source_type="sqlite")
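# Illustrative usage, not part of the original code: the add_*_source helpers only
# record connection details in the catalog under a name and a source_type. Registering a
# local SQLite file could look like the sketch below; the source name and path are
# placeholder assumptions, and ``catalog`` must be an already-constructed Catalog.
def example_register_sqlite(catalog: Catalog) -> None:
    add_sqlite_source(
        catalog=catalog,
        name="local_sqlite",  # placeholder name
        path=Path("/path/to/example.db"),  # placeholder path to an existing SQLite file
    )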
def scan_database(
    catalog: Catalog,
    source: CatSource,
    scan_type: ScanTypeEnum = ScanTypeEnum.metadata,
    incremental: bool = True,
    output_format: OutputFormat = OutputFormat.tabular,
    list_all: bool = False,
    include_schema_regex: Optional[List[str]] = None,
    exclude_schema_regex: Optional[List[str]] = None,
    include_table_regex: Optional[List[str]] = None,
    exclude_table_regex: Optional[List[str]] = None,
    sample_size: int = SMALL_TABLE_MAX,
) -> Union[List[Any], Dict[Any, Any]]:
    message = (
        "Source: {source_name}, scan_type: {scan_type}, "
        "include_schema: {include_schema}, exclude_schema: {exclude_schema}, "
        "include_table: {include_table}, exclude_table: {exclude_table}".format(
            source_name=source.name,
            scan_type=str(scan_type),
            include_schema=",".join(include_schema_regex)
            if include_schema_regex is not None
            else "None",
            exclude_schema=",".join(exclude_schema_regex)
            if exclude_schema_regex is not None
            else "None",
            include_table=",".join(include_table_regex)
            if include_table_regex is not None
            else "None",
            exclude_table=",".join(exclude_table_regex)
            if exclude_table_regex is not None
            else "None",
        )
    )
    status_message = "Success"
    exit_code = 0

    with catalog.managed_session:
        last_run: Optional[datetime.datetime] = None
        if incremental:
            last_task = catalog.get_latest_task("piicatcher.{}".format(source.name))
            last_run = last_task.updated_at if last_task is not None else None
            if last_run is not None:
                LOGGER.debug("Last run at %s", last_run)
            else:
                LOGGER.debug("No last run found")

        try:
            # Refresh the catalog metadata for this source before scanning for PII.
            scan_sources(
                catalog=catalog,
                source_names=[source.name],
                include_schema_regex=include_schema_regex,
                exclude_schema_regex=exclude_schema_regex,
                include_table_regex=include_table_regex,
                exclude_table_regex=exclude_table_regex,
            )
            if scan_type == ScanTypeEnum.metadata:
                detector_list = [
                    detector()
                    for detector in detectors.detector_registry.get_all().values()
                    if issubclass(detector, MetadataDetector)
                ]
                metadata_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                    ),
                    generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                    ),
                )
            else:
                detector_list = [
                    detector()
                    for detector in detectors.detector_registry.get_all().values()
                    if issubclass(detector, DatumDetector)
                ]
                data_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                    ),
                    generator=data_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                        sample_size=sample_size,
                    ),
                    sample_size=sample_size,
                )

            if output_format == OutputFormat.tabular:
                return output_tabular(
                    catalog=catalog, source=source, list_all=list_all, last_run=last_run
                )
            else:
                return output_dict(
                    catalog=catalog, source=source, list_all=list_all, last_run=last_run
                )
        except Exception as e:
            status_message = str(e)
            exit_code = 1
            raise e
        finally:
            catalog.add_task(
                "piicatcher.{}".format(source.name),
                exit_code,
                "{}.{}".format(message, status_message),
            )
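# Illustrative usage, not part of the original code: end to end, a caller looks up a
# registered source and runs scan_database against it. The sketch below assumes a source
# named "local_sqlite" has already been registered (for example with
# example_register_sqlite above) and uses the metadata scan type shown in scan_database;
# the schema regex is a placeholder.
def example_scan(catalog: Catalog):
    with catalog.managed_session:
        source = catalog.get_source("local_sqlite")  # assumes this source is registered
        return scan_database(
            catalog=catalog,
            source=source,
            scan_type=ScanTypeEnum.metadata,
            include_schema_regex=["main"],  # placeholder schema filter
        )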
def bind(self, catalog: Catalog):
    target_table_visitor = RangeVarVisitor()
    target_table_visitor.visit(self._target_table)
    self.logger.debug(
        "Searching for: {}".format(target_table_visitor.search_string)
    )
    self._target_table = catalog.search_table(**target_table_visitor.search_string)
    self.logger.debug("Bound target table: {}".format(self._target_table))

    if len(self._target_columns) == 0:
        self._target_columns = catalog.get_columns_for_table(self._target_table)
        self.logger.debug("Bound all columns in {}".format(self._target_table))
    else:
        bound_cols = catalog.get_columns_for_table(
            self._target_table, column_names=self._target_columns
        )
        # Handle error case
        if len(bound_cols) != len(self._target_columns):
            for column in self._target_columns:
                found = False
                for bound in bound_cols:
                    if column == bound.name:
                        found = True
                        break
                if not found:
                    raise RuntimeError("'{}' column is not found".format(column))
        self._target_columns = bound_cols
        self.logger.debug("Bound {} target columns".format(len(bound_cols)))

    alias_map = {}
    bound_tables = []
    for table in self._source_tables:
        visitor = RangeVarVisitor()
        visitor.visit(table)
        if visitor.alias is not None:
            alias_map[visitor.alias] = visitor.search_string
        self.logger.debug("Searching for: {}".format(visitor.search_string))
        candidate_table = catalog.search_table(**visitor.search_string)
        self.logger.debug("Bound source table: {}".format(candidate_table))
        bound_tables.append(candidate_table)

    self._source_tables = bound_tables

    bound_cols = []
    for column in self._source_columns:
        column_ref_visitor = ColumnRefVisitor()
        column_ref_visitor.visit(column)
        if column_ref_visitor.name[0] in alias_map:
            table_name = alias_map[column_ref_visitor.name[0]]
        else:
            table_name = {"table_like": column_ref_visitor.name[0]}
        self.logger.debug("Searching for: {}".format(table_name))
        candidate_table = catalog.search_table(**table_name)
        bound = catalog.get_columns_for_table(
            table=candidate_table, column_names=[column_ref_visitor.name[1]]
        )
        if len(bound) == 0:
            raise RuntimeError("{} not found in table".format(column))
        elif len(bound) > 1:
            raise RuntimeError("Ambiguous column name. Multiple matches found")
        self.logger.debug("Bound source column: {}".format(bound[0]))
        bound_cols.append(bound[0])

    self._source_columns = bound_cols