def discover_catalog(mssql_conn, config):
    """Return a Catalog describing the structure of the database.

    Builds a fresh MSSQLConnection from *config* (the *mssql_conn*
    argument is replaced; kept for interface compatibility), queries
    INFORMATION_SCHEMA for tables and columns, and emits one
    CatalogEntry per table/view.

    Args:
        mssql_conn: existing connection object (ignored — rebuilt from config).
        config: dict-like tap config; optional "filter_dbs" is a
            comma-separated list of schema names to restrict discovery to.

    Returns:
        Catalog: one entry per discovered table or view.
    """
    LOGGER.info("Preparing Catalog")
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        # NOTE(review): schema names are interpolated straight into the SQL.
        # Config is operator-supplied (trusted), but parameterizing would be safer.
        filter_dbs_clause = ",".join(["'{}'".format(db) for db in filter_dbs_config.split(",")])
        table_schema_clause = "WHERE c.table_schema IN ({})".format(filter_dbs_clause)
    else:
        # Default: skip system schemas.
        table_schema_clause = """
        WHERE c.table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute(
            """SELECT table_schema,
                table_name,
                table_type
            FROM information_schema.tables c
            {}
            """.format(
                table_schema_clause
            )
        )
        # table_info: {schema: {table: {"row_count": ..., "is_view": ...}}}
        # row_count is never populated here; kept as None for downstream metadata.
        table_info = {}
        for (db, table, table_type) in cur.fetchall():
            if db not in table_info:
                table_info[db] = {}
            table_info[db][table] = {"row_count": None, "is_view": table_type == "VIEW"}

        LOGGER.info("Tables fetched, fetching columns")
        cur.execute(
            """with constraint_columns as (
                select c.table_schema
                , c.table_name
                , c.column_name
                from information_schema.constraint_column_usage c
                join information_schema.table_constraints tc
                        on tc.table_schema = c.table_schema
                        and tc.table_name = c.table_name
                        and tc.constraint_name = c.constraint_name
                        and tc.constraint_type in ('PRIMARY KEY', 'UNIQUE'))
                SELECT c.table_schema,
                    c.table_name,
                    c.column_name,
                    data_type,
                    character_maximum_length,
                    numeric_precision,
                    numeric_scale,
                    case when cc.column_name is null then 0 else 1 end
                FROM information_schema.columns c
                left join constraint_columns cc
                    on cc.table_name = c.table_name
                    and cc.table_schema = c.table_schema
                    and cc.column_name = c.column_name
                {}
                ORDER BY c.table_schema, c.table_name
            """.format(
                table_schema_clause
            )
        )
        columns = []
        rec = cur.fetchone()
        while rec is not None:
            columns.append(Column(*rec))
            rec = cur.fetchone()

        LOGGER.info("Columns Fetched")
        entries = []
        # groupby requires rows sorted by the grouping key; the ORDER BY above
        # guarantees (table_schema, table_name) ordering.
        for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object",
                properties={c.column_name: schema_for_column(c) for c in cols},
            )
            md = create_column_metadata(cols)
            md_map = metadata.to_map(md)
            md_map = metadata.write(md_map, (), "database-name", table_schema)

            # BUG FIX: the original read
            # table_info[table_schema][table_name]["is_view"] *before* the
            # membership guard below, so the guard could never prevent the
            # KeyError it was written for. Read inside the guard instead.
            if table_schema in table_info and table_name in table_info[table_schema]:
                is_view = table_info[table_schema][table_name]["is_view"]
                row_count = table_info[table_schema][table_name].get("row_count")
                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)
                md_map = metadata.write(md_map, (), "is-view", is_view)

            key_properties = [c.column_name for c in cols if c.is_primary_key == 1]
            md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

            entry = CatalogEntry(
                table=table_name,
                stream=table_name,
                metadata=metadata.to_list(md_map),
                tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                schema=schema,
            )
            entries.append(entry)

    LOGGER.info("Catalog ready")
    return Catalog(entries)
def discover_catalog(mssql_conn, config):
    """Return a Catalog describing the structure of the database.

    NOTE(review): this is a duplicate definition of ``discover_catalog``
    (an earlier one exists in this file); at import time this later
    definition wins. The two copies should be consolidated.

    Args:
        mssql_conn: existing connection object (ignored — rebuilt from config).
        config: dict-like tap config; optional "filter_dbs" is a
            comma-separated list of schema names to restrict discovery to.

    Returns:
        Catalog: one entry per discovered table or view.
    """
    LOGGER.info("Preparing Catalog")
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        # NOTE(review): schema names are interpolated straight into the SQL.
        # Config is operator-supplied (trusted), but parameterizing would be safer.
        filter_dbs_clause = ",".join(["'{}'".format(db) for db in filter_dbs_config.split(",")])
        table_schema_clause = "WHERE c.table_schema IN ({})".format(filter_dbs_clause)
    else:
        # Default: skip system schemas.
        table_schema_clause = """
        WHERE c.TABLE_SCHEMA NOT IN (
        'information_schema',
        'performance_schema',
        'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute(
            """SELECT TABLE_SCHEMA,
                TABLE_NAME,
                TABLE_TYPE
            FROM INFORMATION_SCHEMA.TABLES c
            {}
            """.format(table_schema_clause)
        )
        # table_info: {schema: {table: {"row_count": ..., "is_view": ...}}}
        # row_count is never populated here; kept as None for downstream metadata.
        table_info = {}
        for (db, table, table_type) in cur.fetchall():
            if db not in table_info:
                table_info[db] = {}
            table_info[db][table] = {"row_count": None, "is_view": table_type == "VIEW"}

        LOGGER.info("Tables fetched, fetching columns")
        cur.execute(
            """with constraint_columns as (
                select c.TABLE_SCHEMA
                , c.TABLE_NAME
                , c.COLUMN_NAME
                from INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE c
                join INFORMATION_SCHEMA.TABLE_CONSTRAINTS tc
                        on tc.TABLE_SCHEMA = c.TABLE_SCHEMA
                        and tc.TABLE_NAME = c.TABLE_NAME
                        and tc.CONSTRAINT_NAME = c.CONSTRAINT_NAME
                        and tc.CONSTRAINT_TYPE in ('PRIMARY KEY', 'UNIQUE'))
                SELECT c.TABLE_SCHEMA,
                    c.TABLE_NAME,
                    c.COLUMN_NAME,
                    DATA_TYPE,
                    CHARACTER_MAXIMUM_LENGTH,
                    NUMERIC_PRECISION,
                    NUMERIC_SCALE,
                    case when cc.COLUMN_NAME is null then 0 else 1 end
                FROM INFORMATION_SCHEMA.COLUMNS c
                left join constraint_columns cc
                    on cc.TABLE_NAME = c.TABLE_NAME
                    and cc.TABLE_SCHEMA = c.TABLE_SCHEMA
                    and cc.COLUMN_NAME = c.COLUMN_NAME
                {}
                ORDER BY c.TABLE_SCHEMA, c.TABLE_NAME
            """.format(table_schema_clause)
        )
        columns = []
        rec = cur.fetchone()
        while rec is not None:
            columns.append(Column(*rec))
            rec = cur.fetchone()

        LOGGER.info("Columns Fetched")
        entries = []
        # groupby requires rows sorted by the grouping key; the ORDER BY above
        # guarantees (table_schema, table_name) ordering.
        for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object",
                properties={c.column_name: schema_for_column(c) for c in cols},
            )
            md = create_column_metadata(cols)
            md_map = metadata.to_map(md)
            md_map = metadata.write(md_map, (), "database-name", table_schema)

            # BUG FIX: the original read
            # table_info[table_schema][table_name]["is_view"] *before* the
            # membership guard below, so the guard could never prevent the
            # KeyError it was written for. Read inside the guard instead.
            if table_schema in table_info and table_name in table_info[table_schema]:
                is_view = table_info[table_schema][table_name]["is_view"]
                row_count = table_info[table_schema][table_name].get("row_count")
                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)
                md_map = metadata.write(md_map, (), "is-view", is_view)

            key_properties = [c.column_name for c in cols if c.is_primary_key == 1]
            md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

            entry = CatalogEntry(
                table=table_name,
                stream=table_name,
                metadata=metadata.to_list(md_map),
                tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                schema=schema,
            )
            entries.append(entry)

    LOGGER.info("Catalog ready")
    return Catalog(entries)
def discover(conn, config):
    """Discover all tables/views on *conn* and return a Catalog.

    Queries INFORMATION_SCHEMA for every table in the connected
    database (no schema filtering, unlike ``discover_catalog``) and
    builds one CatalogEntry per table/view.

    Args:
        conn: an open connection usable with connect_with_backoff.
        config: tap config (currently unused here; kept for interface
            compatibility).

    Returns:
        Catalog: one entry per discovered table or view.
    """
    with connect_with_backoff(conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
                SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
                From INFORMATION_SCHEMA.TABLES
                """)
            # table_info: {db: {schema: {table: {"is_view": bool}}}}
            table_info = {}
            db = None  # remembered so the catalog loop below can index table_info
            for (db, schema, table, table_type) in cur.fetchall():
                table_info.setdefault(db, {}).setdefault(schema, {})[table] = {
                    # 'row_count': never populated by this query
                    'is_view': table_type == 'VIEW'
                }

            # BUG FIX: the original selected C.NUMERIC_PRECISION twice and never
            # fetched NUMERIC_SCALE; the 7th Column field was therefore wrong.
            # BUG FIX: the constraint-usage join matched on COLUMN_NAME alone,
            # flagging a column as constrained if ANY table had a same-named
            # constrained column; join on schema and table as well.
            cur.execute("""
                SELECT C.TABLE_SCHEMA, C.TABLE_NAME, C.COLUMN_NAME, C.DATA_TYPE,
                       C.CHARACTER_MAXIMUM_LENGTH, C.NUMERIC_PRECISION,
                       C.NUMERIC_SCALE, TC.CONSTRAINT_TYPE
                FROM INFORMATION_SCHEMA.COLUMNS C
                LEFT JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE CCU
                    ON C.TABLE_SCHEMA = CCU.TABLE_SCHEMA
                    AND C.TABLE_NAME = CCU.TABLE_NAME
                    AND C.COLUMN_NAME = CCU.COLUMN_NAME
                LEFT JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS TC
                    ON CCU.CONSTRAINT_NAME = TC.CONSTRAINT_NAME
                ORDER BY C.TABLE_SCHEMA, C.TABLE_NAME
                """)
            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            def column_is_key_prop(c, s):
                # BUG FIX: SQL Server's INFORMATION_SCHEMA reports
                # 'PRIMARY KEY', not MySQL's 'PRI', for primary-key constraints.
                return (c.constraint_type == 'PRIMARY KEY'
                        and s.properties[c.column_name].inclusion != 'unsupported')

            entries = []
            # groupby requires sorted input; the ORDER BY above provides it.
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)
                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                # BUG FIX: the original indexed table_info[db][schema][table]
                # unconditionally (relying on the leftover loop variable `db`)
                # and then guarded with the WRONG nesting
                # (`table_schema in table_info`), which could never match.
                # Look up through the db level and guard before reading.
                # NOTE(review): `db` is the last TABLE_CATALOG seen; this
                # assumes a single-database connection — confirm.
                db_tables = table_info.get(db, {})
                is_view = False
                if table_schema in db_tables and table_name in db_tables[table_schema]:
                    info = db_tables[table_schema][table_name]
                    is_view = info['is_view']
                    row_count = info.get('row_count')
                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)
                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                key_properties = [
                    c.column_name for c in cols
                    if column_is_key_prop(c, schema)
                ]

                # Views have no reliable key; only record keys for tables.
                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    # BUG FIX: the original had an unreachable block after this return
    # (raw_schemas = load_schemas() ... return {'streams': streams});
    # dead code removed.
    return Catalog(entries)