def get_inspectors(self) -> Iterable[Inspector]:
    # This method can be overridden in the case that you want to dynamically
    # run on multiple databases.
    url = self.config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **self.config.options)
    with engine.connect() as conn:
        self.get_catalog_metadata(conn)
        inspector = inspect(conn)
        yield inspector
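# A minimal sketch (an assumption, not part of the original source) of how
# get_inspectors might be overridden to run dynamically on multiple databases,
# as the comment above suggests. `self.config.databases` and the per-database
# `get_sql_alchemy_url(database=...)` parameter are hypothetical names used
# only for illustration.
def get_inspectors(self) -> Iterable[Inspector]:
    for database in self.config.databases:  # hypothetical list of database names
        url = self.config.get_sql_alchemy_url(database=database)  # hypothetical parameter
        engine = create_engine(url, **self.config.options)
        with engine.connect() as conn:
            # One inspector per database; the caller iterates over all of them.
            yield inspect(conn)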
def _get_all_tables(self) -> Set[str]:
    all_tables_query: str = textwrap.dedent("""\
        SELECT database, name AS table_name
          FROM system.tables
         WHERE name NOT LIKE '.inner%'""")
    all_tables_set = set()

    url = self.config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **self.config.options)
    for db_row in engine.execute(text(all_tables_query)):
        all_tables_set.add(f'{db_row["database"]}.{db_row["table_name"]}')

    return all_tables_set
def _get_all_tables(self) -> Set[str]:
    all_tables_query: str = """
        select table_schema as schemaname, table_name as tablename
        from information_schema.tables
        where table_type = 'BASE TABLE'
          and table_schema not in ('information_schema', 'pg_catalog', 'pg_internal')
        union
        select distinct schemaname, tablename
        from svv_external_tables
        union
        SELECT n.nspname AS schemaname, c.relname AS tablename
        FROM pg_catalog.pg_class AS c
        INNER JOIN pg_catalog.pg_namespace AS n ON c.relnamespace = n.oid
        WHERE relkind = 'v'
          and n.nspname not in ('pg_catalog', 'information_schema')
    """

    db_name = getattr(self.config, "database")
    db_alias = getattr(self.config, "database_alias")
    if db_alias:
        db_name = db_alias

    all_tables_set = set()

    url = self.config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **self.config.options)
    for db_row in engine.execute(all_tables_query):
        all_tables_set.add(
            f'{db_name}.{db_row["schemaname"]}.{db_row["tablename"]}'
        )

    return all_tables_set
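# Illustrative sketch (an assumption, not from the original source): the
# Redshift variant above prefixes every entry with the configured database (or
# its alias), so its set holds three-part names, while the ClickHouse variant
# holds two-part names. The later `source.path not in self._all_tables_set`
# checks rely on these exact shapes. The values below are examples only.
redshift_entries = {"dev.public.orders", "dev.spectrum.events"}  # "<db>.<schema>.<table>"
clickhouse_entries = {"default.events"}                          # "<database>.<table>"
assert "dev.public.orders" in redshift_entries
assert "default.events" in clickhouse_entries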
def get_catalog_metadata(self, conn: Connection) -> None:
    catalog_metadata = _get_external_db_mapping(conn)
    if catalog_metadata is None:
        return
    db_name = self.get_db_name()

    external_schema_mapping = {}
    for rel in catalog_metadata:
        if rel.eskind != 1:
            logger.debug(
                f"Skipping {rel.schemaname} for mapping to external database as currently we only "
                f"support glue"
            )
            continue
        external_schema_mapping[rel.schemaname] = {
            "eskind": rel.eskind,
            "external_database": rel.databasename,
            "esoptions": rel.esoptions,
            "esoid": rel.esoid,
            "esowner": rel.esowner,
        }
    self.catalog_metadata[db_name] = external_schema_mapping
def _populate_lineage_map(
    self, query: str, lineage_type: LineageCollectorType
) -> None:
    """
    This method generates table-level lineage based on the given query.
    The query should return the following columns: target_schema,
    target_table, source_table, source_schema

    :param query: The query to run to extract lineage.
    :type query: str
    :param lineage_type: The way the lineage should be processed
    :type lineage_type: LineageCollectorType
    :return: This method does not return anything; it directly modifies
        the self._lineage_map property.
    :rtype: None
    """
    assert self._lineage_map is not None

    if not self._all_tables_set:
        self._all_tables_set = self._get_all_tables()

    url = self.config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **self.config.options)

    try:
        for db_row in engine.execute(text(query)):
            if not self.config.schema_pattern.allowed(
                db_row["target_schema"]
            ) or not self.config.table_pattern.allowed(db_row["target_table"]):
                continue

            # Target
            target_path = (
                f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
                f'{db_row["target_schema"]}.{db_row["target_table"]}'
            )

            target = LineageItem(
                dataset=LineageDataset(
                    platform=LineageDatasetPlatform.CLICKHOUSE, path=target_path
                ),
                upstreams=set(),
                collector_type=lineage_type,
            )

            # Source
            platform = LineageDatasetPlatform.CLICKHOUSE
            path = f'{db_row["source_schema"]}.{db_row["source_table"]}'

            sources = [
                LineageDataset(
                    platform=platform,
                    path=path,
                )
            ]

            for source in sources:
                # Filter out tables which do not exist in ClickHouse
                # (they were deleted in the meantime).
                if (
                    source.platform == LineageDatasetPlatform.CLICKHOUSE
                    and source.path not in self._all_tables_set
                ):
                    logger.warning(f"{source.path} missing table")
                    continue

                target.upstreams.add(source)

            # Merge upstreams if the dataset already exists in the lineage map.
            if target.dataset.path in self._lineage_map:
                self._lineage_map[target.dataset.path].upstreams = self._lineage_map[
                    target.dataset.path
                ].upstreams.union(target.upstreams)
            else:
                self._lineage_map[target.dataset.path] = target

            logger.info(
                f"Lineage[{target}]:{self._lineage_map[target.dataset.path]}"
            )
    except Exception as e:
        logger.warning(
            f"Extracting {lineage_type.name} lineage from ClickHouse failed. "
            f"Continuing...\nError was {e}."
        )
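# A hedged, hypothetical example (not from the original source) of the query
# shape the ClickHouse _populate_lineage_map above expects: any SQL returning
# the columns target_schema, target_table, source_schema and source_table
# works. The constant values below are illustrative assumptions only.
example_clickhouse_lineage_query = """
    SELECT
        'analytics'  AS target_schema,
        'daily_view' AS target_table,
        'default'    AS source_schema,
        'events'     AS source_table
"""
# Each returned row becomes one target LineageItem whose upstreams contain the
# source dataset, provided "default.events" is present in self._all_tables_set.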
def _populate_lineage_map(
    self, query: str, lineage_type: LineageCollectorType
) -> None:
    """
    This method generates table-level lineage based on the given query.
    The query should return the following columns: target_schema,
    target_table, source_table, source_schema

    source_table and source_schema can be omitted if the sql_field is set,
    because then the source_table and source_schema are extracted from the
    sql_field by SQL parsing.

    :param query: The query to run to extract lineage.
    :type query: str
    :param lineage_type: The way the lineage should be processed
    :type lineage_type: LineageCollectorType
    :return: This method does not return anything; it directly modifies
        the self._lineage_map property.
    :rtype: None
    """
    assert self._lineage_map is not None

    if not self._all_tables_set:
        self._all_tables_set = self._get_all_tables()

    url = self.config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **self.config.options)

    db_name = self._get_db_name()

    try:
        for db_row in engine.execute(query):
            if not self.config.schema_pattern.allowed(
                db_row["target_schema"]
            ) or not self.config.table_pattern.allowed(db_row["target_table"]):
                continue

            # Target
            target_path = (
                f'{db_name}.{db_row["target_schema"]}.{db_row["target_table"]}'
            )

            target = LineageItem(
                dataset=LineageDataset(
                    platform=LineageDatasetPlatform.REDSHIFT, path=target_path
                ),
                upstreams=set(),
                collector_type=lineage_type,
                query_parser_failed_sqls=list(),
            )

            sources: List[LineageDataset] = list()

            # Source
            if lineage_type in [
                lineage_type.QUERY_SQL_PARSER,
                lineage_type.NON_BINDING_VIEW,
            ]:
                try:
                    sources = self._get_sources_from_query(
                        db_name=db_name, query=db_row["ddl"]
                    )
                except Exception as e:
                    target.query_parser_failed_sqls.append(db_row["ddl"])
                    logger.warning(
                        f'Error parsing query {db_row["ddl"]} for getting lineage.'
                        f"\nError was {e}."
                    )
            else:
                if lineage_type == lineage_type.COPY:
                    platform = LineageDatasetPlatform.S3
                    path = db_row["filename"].strip()
                    if urlparse(path).scheme != "s3":
                        logger.warning(
                            f"Only s3 source supported with copy. The source was: {path}."
                        )
                        continue
                else:
                    platform = LineageDatasetPlatform.REDSHIFT
                    path = f'{db_name}.{db_row["source_schema"]}.{db_row["source_table"]}'

                sources = [
                    LineageDataset(
                        platform=platform,
                        path=path,
                    )
                ]

            for source in sources:
                # Filter out tables which do not exist in Redshift: either they
                # were deleted in the meantime, or the query parser did not
                # capture the table name correctly.
                if (
                    source.platform == LineageDatasetPlatform.REDSHIFT
                    and source.path not in self._all_tables_set
                ):
                    logger.warning(f"{source.path} missing table")
                    continue

                target.upstreams.add(source)

            # Merge upstreams if the dataset already exists in the lineage map.
            if target.dataset.path in self._lineage_map:
                self._lineage_map[target.dataset.path].upstreams = self._lineage_map[
                    target.dataset.path
                ].upstreams.union(target.upstreams)
            else:
                self._lineage_map[target.dataset.path] = target

            logger.info(
                f"Lineage[{target}]:{self._lineage_map[target.dataset.path]}"
            )
    except Exception as e:
        logger.warning(
            f"Extracting {lineage_type.name} lineage from Redshift failed. "
            f"Continuing...\nError was {e}."
        )
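# A small self-contained sketch (an assumption, not from the original source)
# showing why the COPY branch above checks urlparse(path).scheme: only s3://
# sources are kept, and anything else is skipped with a warning. The example
# paths are illustrative only.
from urllib.parse import urlparse

for candidate in ["s3://my-bucket/data/file.csv", "file:///tmp/local.csv"]:
    if urlparse(candidate).scheme != "s3":
        print(f"Only s3 source supported with copy. The source was: {candidate}.")
    else:
        print(f"Accepted copy source: {candidate}")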