Example #1
def metadata_scan(
    catalog: Catalog,
    detectors: List[MetadataDetector],
    work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None,
                              None],
    generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
):
    # Drain the counting generator once to size the progress bar.
    total_columns = sum(1 for _ in work_generator)

    counter = 0
    set_number = 0
    for schema, table, column in tqdm(generator,
                                      total=total_columns,
                                      desc="columns",
                                      unit="columns"):
        counter += 1
        LOGGER.debug("Scanning column name %s", column.fqdn)
        for detector in detectors:
            pii_type = detector.detect(column)
            if pii_type is not None:
                set_number += 1
                catalog.set_column_pii_type(column=column,
                                            pii_type=pii_type,
                                            pii_plugin=detector.name)
                break

    LOGGER.info("Columns Scanned: %d, Columns Labeled: %d", counter,
                set_number)
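Note that metadata_scan takes two generators over the same columns: work_generator is drained once just to compute the tqdm progress total, while generator drives the actual scan. A minimal invocation sketch, assuming catalog is a connected Catalog, source is a registered CatSource, and my_detectors is a list of MetadataDetector instances (all three names are hypothetical):

with catalog.managed_session:
    metadata_scan(
        catalog=catalog,
        detectors=my_detectors,
        # Two independent generators over the same columns: the first is
        # consumed for the progress total, the second is actually scanned.
        work_generator=column_generator(catalog=catalog, source=source),
        generator=column_generator(catalog=catalog, source=source),
    )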
Example #2
def extract_lineage(
    catalog: Catalog,
    visited_query: DmlVisitor,
    source: CatSource,
    parsed: Parsed,
    start_time,
    end_time,
) -> JobExecution:
    job = catalog.add_job(name=parsed.name,
                          source=source,
                          context={"query": parsed.query})
    job_execution = catalog.add_job_execution(
        job=job,
        started_at=start_time,
        ended_at=end_time,
        status=JobExecutionStatus.SUCCESS,
    )
    for src, target in zip(visited_query.source_columns,
                           visited_query.target_columns):
        for column in src.columns:
            edge = catalog.add_column_lineage(column, target, job_execution.id,
                                              {})
            logging.debug("Added {}".format(edge))

    return job_execution
Example #3
    def _bind_target(self, catalog: Catalog, source: CatSource):
        target_table_visitor = RangeVarVisitor()
        target_table_visitor(self._insert_table)

        if target_table_visitor.is_qualified:
            schema = catalog.get_schema(
                source_name=source.name,
                schema_name=target_table_visitor.schema_name)
        elif source.default_schema is not None:
            schema = source.default_schema.schema
        else:
            raise SemanticError("No default schema set for source {}".format(
                source.fqdn))

        self._target_table = catalog.add_table(
            table_name=target_table_visitor.name, schema=schema)

        sort_order = 1
        for col in self._insert_columns:
            self._target_columns.append(
                catalog.add_column(
                    column_name=col,
                    data_type="varchar",
                    sort_order=sort_order,
                    table=self._target_table,
                ))
            sort_order += 1
Example #4
def scan_sources(
    catalog: Catalog,
    source_names: Optional[List[str]] = None,
    include_schema_regex: Optional[List[str]] = None,
    exclude_schema_regex: Optional[List[str]] = None,
    include_table_regex: Optional[List[str]] = None,
    exclude_table_regex: Optional[List[str]] = None,
):
    with catalog.managed_session:
        if source_names is not None and len(source_names) > 0:
            sources: List[CatSource] = []
            for source_name in source_names:
                try:
                    sources.append(catalog.get_source(source_name))
                except NoResultFound:
                    LOGGER.error("Source '%s' not found", source_name)
        else:
            sources = catalog.get_sources()

        LOGGER.info("%d sources will be scanned", len(sources))
        for source in sources:
            scanner = DbScanner(
                catalog,
                source,
                include_schema_regex_str=include_schema_regex,
                exclude_schema_regex_str=exclude_schema_regex,
                include_table_regex_str=include_table_regex,
                exclude_table_regex_str=exclude_table_regex,
            )
            LOGGER.info("Scanning {}".format(scanner.name))
            try:
                scanner.scan()
            except StopIteration:
                raise NoMatchesError
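A hypothetical call with illustrative source names and regexes; note that an unmatched source name is only logged as an error, so the remaining sources are still scanned:

scan_sources(
    catalog=catalog,  # assumed to be an open Catalog
    source_names=["prod_pg"],
    include_schema_regex=["public"],
    exclude_table_regex=[r"^tmp_.*"],
)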
Example #5
def data_scan(
    catalog: Catalog,
    detectors: List[DatumDetector],
    work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None,
                              None],
    generator: Generator[Tuple[CatSchema, CatTable, CatColumn, str], None,
                         None],
    sample_size: int = SMALL_TABLE_MAX,
):
    # Drain the counting generator once; only text columns are sampled.
    text_columns = _filter_text_columns([c for _, _, c in work_generator])
    total_work = len(text_columns) * sample_size

    counter = 0
    set_number = 0

    for schema, table, column, val in tqdm(generator,
                                           total=total_work,
                                           desc="datum",
                                           unit="datum"):
        counter += 1
        LOGGER.debug("Scanning column name %s", column.fqdn)
        if val is not None:
            for detector in detectors:
                pii_type = detector.detect(column=column, datum=val)
                if pii_type is not None:
                    set_number += 1

                    catalog.set_column_pii_type(column=column,
                                                pii_type=pii_type,
                                                pii_plugin=detector.name)
                    LOGGER.debug("%s has %s", column.fqdn, pii_type)

                    scan_logger.info("deep_scan",
                                     extra={
                                         "column": column.fqdn,
                                         "pii_types": pii_type
                                     })
                    data_logger.info(
                        "deep_scan",
                        extra={
                            "column": column.fqdn,
                            "data": val,
                            "pii_types": pii_type
                        },
                    )
                    break
                    break
    LOGGER.info("Columns Scanned: %d, Columns Labeled: %d", counter,
                set_number)
Example #6
def column_generator(
    catalog: Catalog,
    source: CatSource,
    last_run: Optional[datetime.datetime] = None,
    include_schema_regex_str: Optional[List[str]] = None,
    exclude_schema_regex_str: Optional[List[str]] = None,
    include_table_regex_str: Optional[List[str]] = None,
    exclude_table_regex_str: Optional[List[str]] = None,
) -> Generator[Tuple[CatSchema, CatTable, CatColumn], None, None]:

    try:
        for schema, table in table_generator(
                catalog=catalog,
                source=source,
                include_schema_regex_str=include_schema_regex_str,
                exclude_schema_regex_str=exclude_schema_regex_str,
                include_table_regex_str=include_table_regex_str,
                exclude_table_regex_str=exclude_table_regex_str,
        ):

            for column in catalog.get_columns_for_table(table=table,
                                                        newer_than=last_run):
                LOGGER.debug(
                    f"Scanning {schema.name}.{table.name}.{column.name}")
                yield schema, table, column
    except StopIteration:
        raise NoMatchesError
Example #7
def create_graph(catalog: Catalog,
                 visited_queries: List[DmlVisitor]) -> DbGraph:
    logger = LogMixin()
    job_ids = set()
    for query in visited_queries:
        job = catalog.add_job(query.name, {})
        job_execution = catalog.add_job_execution(job, datetime.now(),
                                                  datetime.now(),
                                                  JobExecutionStatus.SUCCESS)
        for source, target in zip(query.source_columns, query.target_columns):
            edge = catalog.add_column_lineage(source, target, job_execution.id,
                                              {})
            job_ids.add(job.id)
            logger.logger.debug("Added {}".format(edge))

    graph = DbGraph(catalog, job_ids)
    graph.load()
    return graph
Example #8
    def _bind_target(self, catalog: Catalog, source: CatSource):
        target_table_visitor = RangeVarVisitor()
        target_table_visitor(self._insert_table)
        logging.debug("Searching for: {}".format(
            target_table_visitor.search_string))
        try:
            self._target_table = catalog.search_table(
                source_like=source.name, **target_table_visitor.search_string)
        except RuntimeError as error:
            logging.debug(str(error))
            raise TableNotFound(
                '"{schema_like}"."{table_like}" is not found'.format(
                    **target_table_visitor.search_string))
        logging.debug("Bound target table: {}".format(self._target_table))
        if len(self._insert_columns) == 0:
            self._target_columns = catalog.get_columns_for_table(
                self._target_table)
            logging.debug("Bound all columns in {}".format(self._target_table))
        else:
            bound_cols = catalog.get_columns_for_table(
                self._target_table, column_names=self._insert_columns)
            # Handle error case
            if len(bound_cols) != len(self._insert_columns):
                for column in self._insert_columns:
                    found = False
                    for bound in bound_cols:
                        if column == bound.name:
                            found = True
                            break

                    if not found:
                        raise ColumnNotFound(
                            '"{}" not found in the following tables: {}'.format(
                                column,
                                json.dumps([self._target_table],
                                           cls=CatTableEncoder),
                            ))

            self._target_columns = bound_cols
            logging.debug("Bound {} target columns".format(len(bound_cols)))
Example #9
def runserver(obj, port):
    logger = LogMixin()
    with open(obj, "r") as file:
        config = yaml.load(file, Loader=yaml.FullLoader)

    logger.logger.debug("Load config file: {}".format(obj))
    logger.logger.debug(config)
    catalog = Catalog(**config["catalog"])

    #    elif config.snowflake is not None:
    #        source = Snowflake(config.file)
    server = Server(port, catalog)
    server.run_server()
Example #10
def table_generator(
    catalog: Catalog,
    source: CatSource,
    include_schema_regex_str: Optional[List[str]] = None,
    exclude_schema_regex_str: Optional[List[str]] = None,
    include_table_regex_str: Optional[List[str]] = None,
    exclude_table_regex_str: Optional[List[str]] = None,
) -> Generator[Tuple[CatSchema, CatTable], None, None]:

    schemata = filter_objects(
        include_schema_regex_str,
        exclude_schema_regex_str,
        [
            CatalogObject(s.name, s.id)
            for s in catalog.search_schema(source_like=source.name,
                                           schema_like="%")
        ],
    )

    for schema_object in schemata:
        schema = catalog.get_schema_by_id(schema_object.id)
        LOGGER.info("Generating schema %s", schema.name)
        table_objects = filter_objects(
            include_table_regex_str,
            exclude_table_regex_str,
            [
                CatalogObject(t.name, t.id)
                for t in catalog.search_tables(source_like=source.name,
                                               schema_like=schema.name,
                                               table_like="%")
            ],
        )

        for table_object in table_objects:
            table = catalog.get_table_by_id(table_object.id)
            LOGGER.info("Generating table %s", table.name)
            yield schema, table
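A sketch of consuming table_generator directly, assuming an open catalog session and a registered source; the schema regex is illustrative:

with catalog.managed_session:
    for schema, table in table_generator(
            catalog=catalog,
            source=source,
            include_schema_regex_str=["public"],
    ):
        print(f"{schema.name}.{table.name}")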
Example #11
def add_athena_source(
    catalog: Catalog,
    name: str,
    region_name: str,
    s3_staging_dir: str,
    aws_access_key_id: Optional[str] = None,
    aws_secret_access_key: Optional[str] = None,
) -> CatSource:
    with catalog.commit_context:
        return catalog.add_source(
            name=name,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
            s3_staging_dir=s3_staging_dir,
            source_type="athena",
        )
Example #12
def add_redshift_source(
    catalog: Catalog,
    name: str,
    username: str,
    password: str,
    database: str,
    uri: str,
    port: Optional[int] = None,
) -> CatSource:
    with catalog.commit_context:
        return catalog.add_source(
            name=name,
            username=username,
            password=password,
            database=database,
            uri=uri,
            port=port,
            source_type="redshift",
        )
Example #13
def add_snowflake_source(
    catalog: Catalog,
    name: str,
    account: str,
    username: str,
    password: str,
    database: str,
    warehouse: str,
    role: str,
) -> CatSource:
    with catalog.commit_context:
        return catalog.add_source(
            name=name,
            username=username,
            password=password,
            database=database,
            account=account,
            warehouse=warehouse,
            role=role,
            source_type="snowflake",
        )
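The add_*_source helpers in Examples #11-#13 all follow the same shape: open a commit context and register connection details under a source_type. A sketch with illustrative values only (read real secrets from the environment rather than hard-coding them):

import os

source = add_snowflake_source(
    catalog=catalog,  # assumed to be an open Catalog
    name="snowflake_prod",
    account="example-account",
    username="scanner",
    password=os.environ["SNOWFLAKE_PASSWORD"],
    database="ANALYTICS",
    warehouse="COMPUTE_WH",
    role="PII_SCANNER",
)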
Example #14
def data_generator(
    catalog: Catalog,
    source: CatSource,
    last_run: Optional[datetime.datetime] = None,
    include_schema_regex_str: Optional[List[str]] = None,
    exclude_schema_regex_str: Optional[List[str]] = None,
    include_table_regex_str: Optional[List[str]] = None,
    exclude_table_regex_str: Optional[List[str]] = None,
    sample_size: int = SMALL_TABLE_MAX,
) -> Generator[Tuple[CatSchema, CatTable, CatColumn, str], None, None]:

    for schema, table in table_generator(
            catalog=catalog,
            source=source,
            include_schema_regex_str=include_schema_regex_str,
            exclude_schema_regex_str=exclude_schema_regex_str,
            include_table_regex_str=include_table_regex_str,
            exclude_table_regex_str=exclude_table_regex_str,
    ):

        try:
            columns = _filter_text_columns(
                catalog.get_columns_for_table(table=table,
                                              newer_than=last_run))
            if len(columns) > 0:
                for row in _row_generator(
                        column_list=columns,
                        schema=schema,
                        table=table,
                        source=source,
                        sample_size=sample_size,
                ):
                    for col, val in zip(columns, row):
                        yield schema, table, col, val
        except StopIteration:
            raise NoMatchesError
        except exc.SQLAlchemyError as e:
            LOGGER.warning(
                f"Exception when getting data for {schema.name}.{table.name}. Code: {e.code}"
            )
Example #15
def add_sqlite_source(
    catalog: Catalog, name: str, path: Path,
):
    with catalog.managed_session:
        catalog.add_source(name=name, uri=str(path), source_type="sqlite")
Example #16
def scan_database(
    catalog: Catalog,
    source: CatSource,
    scan_type: ScanTypeEnum = ScanTypeEnum.metadata,
    incremental: bool = True,
    output_format: OutputFormat = OutputFormat.tabular,
    list_all: bool = False,
    include_schema_regex: Optional[List[str]] = None,
    exclude_schema_regex: Optional[List[str]] = None,
    include_table_regex: Optional[List[str]] = None,
    exclude_table_regex: Optional[List[str]] = None,
    sample_size: int = SMALL_TABLE_MAX,
) -> Union[List[Any], Dict[Any, Any]]:
    message = (
        "Source: {source_name}, scan_type: {scan_type}, "
        "include_schema: {include_schema}, exclude_schema: {exclude_schema}, "
        "include_table: {include_table}, exclude_table: {exclude_table}"
    ).format(
        source_name=source.name,
        scan_type=str(scan_type),
        include_schema=",".join(include_schema_regex)
        if include_schema_regex is not None else "None",
        exclude_schema=",".join(exclude_schema_regex)
        if exclude_schema_regex is not None else "None",
        include_table=",".join(include_table_regex)
        if include_table_regex is not None else "None",
        exclude_table=",".join(exclude_table_regex)
        if exclude_table_regex is not None else "None",
    )

    status_message = "Success"
    exit_code = 0

    with catalog.managed_session:
        last_run: Optional[datetime.datetime] = None
        if incremental:
            last_task = catalog.get_latest_task("piicatcher.{}".format(
                source.name))
            last_run = last_task.updated_at if last_task is not None else None
            if last_run is not None:
                LOGGER.debug("Last Run at {}", last_run)
            else:
                LOGGER.debug("No last run found")

        try:
            scan_sources(
                catalog=catalog,
                source_names=[source.name],
                include_schema_regex=include_schema_regex,
                exclude_schema_regex=exclude_schema_regex,
                include_table_regex=include_table_regex,
                exclude_table_regex=exclude_table_regex,
            )

            if scan_type == ScanTypeEnum.metadata:
                detector_list = [
                    detector() for detector in
                    detectors.detector_registry.get_all().values()
                    if issubclass(detector, MetadataDetector)
                ]

                metadata_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                    ),
                    generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                    ),
                )
            else:
                detector_list = [
                    detector() for detector in
                    detectors.detector_registry.get_all().values()
                    if issubclass(detector, DatumDetector)
                ]

                data_scan(
                    catalog=catalog,
                    detectors=detector_list,
                    work_generator=column_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                    ),
                    generator=data_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                        sample_size=sample_size,
                    ),
                    sample_size=sample_size,
                )

            if output_format == OutputFormat.tabular:
                return output_tabular(catalog=catalog,
                                      source=source,
                                      list_all=list_all,
                                      last_run=last_run)
            else:
                return output_dict(catalog=catalog,
                                   source=source,
                                   list_all=list_all,
                                   last_run=last_run)
        except Exception as e:
            status_message = str(e)
            exit_code = 1
            raise
        finally:
            catalog.add_task(
                "piicatcher.{}".format(source.name),
                exit_code,
                "{}.{}".format(message, status_message),
            )
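An end-to-end sketch tying the pieces together, assuming source was previously registered (e.g. with one of the add_*_source helpers) and fetched via catalog.get_source; the scan type and sample size are illustrative:

output = scan_database(
    catalog=catalog,
    source=source,            # a CatSource obtained via catalog.get_source
    scan_type=ScanTypeEnum.data,
    incremental=False,
    output_format=OutputFormat.tabular,
    sample_size=10,
)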
Example #17
    def bind(self, catalog: Catalog):
        target_table_visitor = RangeVarVisitor()
        target_table_visitor.visit(self._target_table)

        self.logger.debug(
            "Searching for: {}".format(target_table_visitor.search_string)
        )
        self._target_table = catalog.search_table(**target_table_visitor.search_string)
        self.logger.debug("Bound target table: {}".format(self._target_table))

        if len(self._target_columns) == 0:
            self._target_columns = catalog.get_columns_for_table(self._target_table)
            self.logger.debug("Bound all columns in {}".format(self._target_table))
        else:
            bound_cols = catalog.get_columns_for_table(
                self._target_table, column_names=self._target_columns
            )
            # Handle error case
            if len(bound_cols) != len(self._target_columns):
                for column in self._target_columns:
                    found = False
                    for bound in bound_cols:
                        if column == bound.name:
                            found = True
                            break

                    if not found:
                        raise RuntimeError("'{}' column is not found".format(column))

            self._target_columns = bound_cols
            self.logger.debug("Bound {} target columns".format(len(bound_cols)))

        alias_map = {}
        bound_tables = []
        for table in self._source_tables:
            visitor = RangeVarVisitor()
            visitor.visit(table)
            if visitor.alias is not None:
                alias_map[visitor.alias] = visitor.search_string

            self.logger.debug("Searching for: {}".format(visitor.search_string))

            candidate_table = catalog.search_table(**visitor.search_string)
            self.logger.debug("Bound source table: {}".format(candidate_table))
            bound_tables.append(candidate_table)

        self._source_tables = bound_tables
        bound_cols = []
        for column in self._source_columns:
            column_ref_visitor = ColumnRefVisitor()
            column_ref_visitor.visit(column)
            if column_ref_visitor.name[0] in alias_map:
                table_name = alias_map[column_ref_visitor.name[0]]
            else:
                table_name = {"table_like": column_ref_visitor.name[0]}

            self.logger.debug("Searching for: {}".format(table_name))
            candidate_table = catalog.search_table(**table_name)

            bound = catalog.get_columns_for_table(
                table=candidate_table, column_names=[column_ref_visitor.name[1]]
            )
            if len(bound) == 0:
                raise RuntimeError("{} not found in table".format(column))
            elif len(bound) > 1:
                raise RuntimeError("Ambiguous column name. Multiple matches found")

            self.logger.debug("Bound source column: {}".format(bound[0]))
            bound_cols.append(bound[0])

        self._source_columns = bound_cols