Beispiel #1
0
def discover_catalog(mssql_conn, config):
    """Return a Catalog describing the structure of the database.

    Two INFORMATION_SCHEMA queries are issued (tables, then columns) and one
    CatalogEntry is built per (schema, table) pair present in the column
    listing.

    Args:
        mssql_conn: Not used as passed; it is immediately replaced by a new
            ``MSSQLConnection`` built from ``config``. Kept so the call
            signature stays unchanged.
        config: Mapping of settings. The optional ``"filter_dbs"`` entry is a
            comma-separated list of schema names to restrict discovery to;
            when it is missing or empty, the schemas ``information_schema``,
            ``performance_schema`` and ``sys`` are excluded instead.

    Returns:
        A ``Catalog`` wrapping the list of discovered ``CatalogEntry`` objects.
    """
    LOGGER.info("Preparing Catalog")
    # NOTE(review): the caller-supplied connection is discarded here; confirm
    # whether callers expect their own connection object to be reused.
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        # The names end up inside the SQL text, so trim padding around each
        # entry and double embedded single quotes to keep every name inside
        # its string literal.
        filter_dbs_clause = ",".join(
            "'{}'".format(db.strip().replace("'", "''"))
            for db in filter_dbs_config.split(",")
        )
        table_schema_clause = "WHERE c.table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE c.table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute(
            """SELECT table_schema,
                table_name,
                table_type
            FROM information_schema.tables c
            {}
        """.format(
                table_schema_clause
            )
        )

        # schema name -> table name -> details used for stream-level metadata.
        table_info = {}
        for (db, table, table_type) in cur.fetchall():
            table_info.setdefault(db, {})[table] = {
                "row_count": None,
                "is_view": table_type == "VIEW",
            }

        LOGGER.info("Tables fetched, fetching columns")
        # DISTINCT in the CTE keeps a column that belongs to several
        # PRIMARY KEY / UNIQUE constraints from being returned more than once.
        cur.execute(
            """with constraint_columns as (
                select distinct c.table_schema
                , c.table_name
                , c.column_name

                from information_schema.constraint_column_usage c

                join information_schema.table_constraints tc
                        on tc.table_schema = c.table_schema
                        and tc.table_name = c.table_name
                        and tc.constraint_name = c.constraint_name
                        and tc.constraint_type in ('PRIMARY KEY', 'UNIQUE'))
                SELECT c.table_schema,
                    c.table_name,
                    c.column_name,
                    data_type,
                    character_maximum_length,
                    numeric_precision,
                    numeric_scale,
                    case when cc.column_name is null then 0 else 1 end
                FROM information_schema.columns c

                left join constraint_columns cc
                    on cc.table_name = c.table_name
                    and cc.table_schema = c.table_schema
                    and cc.column_name = c.column_name

                {}
                ORDER BY c.table_schema, c.table_name
        """.format(
                table_schema_clause
            )
        )
        # Pull rows one at a time until the cursor reports exhaustion (None).
        columns = [Column(*rec) for rec in iter(cur.fetchone, None)]
        LOGGER.info("Columns Fetched")

        entries = []
        # groupby only merges adjacent rows; the query's ORDER BY provides
        # the required ordering by (schema, table).
        for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object", properties={c.column_name: schema_for_column(c) for c in cols}
            )
            md_map = metadata.to_map(create_column_metadata(cols))
            md_map = metadata.write(md_map, (), "database-name", table_schema)

            # A table may show up in the column listing without having been
            # seen by the table query; skip table-level details in that case
            # instead of failing the whole discovery.
            table_details = table_info.get(table_schema, {}).get(table_name)
            if table_details is not None:
                row_count = table_details.get("row_count")
                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)

                md_map = metadata.write(md_map, (), "is-view", table_details["is_view"])

            key_properties = [c.column_name for c in cols if c.is_primary_key == 1]
            md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

            entries.append(
                CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                    schema=schema,
                )
            )
    LOGGER.info("Catalog ready")
    return Catalog(entries)
Beispiel #2
0
def discover_catalog(mssql_conn, config):
    """Return a Catalog describing the structure of the database.

    The function queries INFORMATION_SCHEMA.TABLES and
    INFORMATION_SCHEMA.COLUMNS and emits one CatalogEntry for every
    (schema, table) pair that appears in the column listing.

    Args:
        mssql_conn: Not used as passed; a new ``MSSQLConnection`` built from
            ``config`` replaces it straight away. Present only to keep the
            call signature unchanged.
        config: Mapping of settings. ``"filter_dbs"`` (optional) is a
            comma-separated list of schema names to include; when it is
            missing or empty, ``information_schema``, ``performance_schema``
            and ``sys`` are excluded instead.

    Returns:
        A ``Catalog`` wrapping the list of discovered ``CatalogEntry`` objects.
    """
    LOGGER.info("Preparing Catalog")
    # NOTE(review): the connection argument is overwritten unconditionally;
    # confirm that no caller relies on its own connection being used.
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        # Each name is embedded in the SQL text: trim surrounding whitespace
        # and double any single quote so the name cannot terminate its
        # string literal.
        quoted_names = [
            "'{}'".format(name.strip().replace("'", "''"))
            for name in filter_dbs_config.split(",")
        ]
        table_schema_clause = "WHERE c.table_schema IN ({})".format(
            ",".join(quoted_names))
    else:
        table_schema_clause = """
        WHERE c.TABLE_SCHEMA NOT IN (
        'information_schema',
        'performance_schema',
        'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute("""SELECT TABLE_SCHEMA,
                TABLE_NAME,
                TABLE_TYPE
            FROM INFORMATION_SCHEMA.TABLES c
            {}
        """.format(table_schema_clause))

        # schema name -> table name -> details used for stream-level metadata.
        table_info = {}
        for (db, table, table_type) in cur.fetchall():
            table_info.setdefault(db, {})[table] = {
                "row_count": None,
                "is_view": table_type == "VIEW",
            }

        LOGGER.info("Tables fetched, fetching columns")
        # DISTINCT in the CTE prevents a column covered by more than one
        # PRIMARY KEY / UNIQUE constraint from producing duplicate rows.
        cur.execute("""with constraint_columns as (
                select distinct c.TABLE_SCHEMA
                , c.TABLE_NAME
                , c.COLUMN_NAME

                from INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE c

                join INFORMATION_SCHEMA.TABLE_CONSTRAINTS tc
                        on tc.TABLE_SCHEMA = c.TABLE_SCHEMA
                        and tc.TABLE_NAME = c.TABLE_NAME
                        and tc.CONSTRAINT_NAME = c.CONSTRAINT_NAME
                        and tc.CONSTRAINT_TYPE in ('PRIMARY KEY', 'UNIQUE'))
                SELECT c.TABLE_SCHEMA,
                    c.TABLE_NAME,
                    c.COLUMN_NAME,
                    DATA_TYPE,
                    CHARACTER_MAXIMUM_LENGTH,
                    NUMERIC_PRECISION,
                    NUMERIC_SCALE,
                    case when cc.COLUMN_NAME is null then 0 else 1 end
                FROM INFORMATION_SCHEMA.COLUMNS c

                left join constraint_columns cc
                    on cc.TABLE_NAME = c.TABLE_NAME
                    and cc.TABLE_SCHEMA = c.TABLE_SCHEMA
                    and cc.COLUMN_NAME = c.COLUMN_NAME

                {}
                ORDER BY c.TABLE_SCHEMA, c.TABLE_NAME
        """.format(table_schema_clause))
        # fetchone() returns None once the result set is exhausted.
        columns = [Column(*rec) for rec in iter(cur.fetchone, None)]
        LOGGER.info("Columns Fetched")

        entries = []
        # The ORDER BY above keeps rows of one table adjacent, which is what
        # groupby needs to produce a single group per table.
        for (k, cols) in itertools.groupby(
                columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object",
                properties={c.column_name: schema_for_column(c)
                            for c in cols})
            md_map = metadata.to_map(create_column_metadata(cols))
            md_map = metadata.write(md_map, (), "database-name", table_schema)

            # Look the table up defensively: a table present in the column
            # listing but absent from the table listing simply gets no
            # table-level details rather than raising KeyError.
            table_details = table_info.get(table_schema, {}).get(table_name)
            if table_details is not None:
                row_count = table_details.get("row_count")
                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)

                md_map = metadata.write(md_map, (), "is-view",
                                        table_details["is_view"])

            key_properties = [
                c.column_name for c in cols if c.is_primary_key == 1
            ]
            md_map = metadata.write(md_map, (), "table-key-properties",
                                    key_properties)

            entries.append(
                CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema,
                ))
    LOGGER.info("Catalog ready")
    return Catalog(entries)
Beispiel #3
0
def discover(conn, config):
    """Build a Catalog from the INFORMATION_SCHEMA views of the database.

    Args:
        conn: Connection object handed to ``connect_with_backoff``.
        config: Not read by this function; kept so the call signature stays
            unchanged.

    Returns:
        A ``Catalog`` holding one ``CatalogEntry`` per (schema, table) pair
        that appears in INFORMATION_SCHEMA.COLUMNS.
    """
    with connect_with_backoff(conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
            SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE From INFORMATION_SCHEMA.TABLES
            """)

            # Keyed by (schema, table): the column query below carries no
            # catalog column, so that pair is the only key both result sets
            # share.
            table_info = {}
            for (_catalog, schema_name, table, table_type) in cur.fetchall():
                table_info[(schema_name, table)] = {
                    'is_view': table_type == 'VIEW'
                }

            # Constraint usage is matched on schema + table + column, and the
            # constraint itself on schema + name, so a column is only paired
            # with constraints defined on its own table.
            cur.execute("""
            SELECT
       C.TABLE_SCHEMA, C.TABLE_NAME, C.COLUMN_NAME, C.DATA_TYPE, C.CHARACTER_MAXIMUM_LENGTH, C.NUMERIC_PRECISION,
       C.NUMERIC_SCALE, TC.CONSTRAINT_TYPE
FROM INFORMATION_SCHEMA.COLUMNS C
    LEFT JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE CCU
        ON C.TABLE_SCHEMA = CCU.TABLE_SCHEMA
        AND C.TABLE_NAME = CCU.TABLE_NAME
        AND C.COLUMN_NAME = CCU.COLUMN_NAME
    LEFT JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS TC
        ON CCU.CONSTRAINT_SCHEMA = TC.CONSTRAINT_SCHEMA
        AND CCU.CONSTRAINT_NAME = TC.CONSTRAINT_NAME
ORDER BY C.TABLE_SCHEMA, C.TABLE_NAME
            """)

            # fetchone() returns None once the result set is exhausted.
            columns = [Column(*rec) for rec in iter(cur.fetchone, None)]

            entries = []
            # The ORDER BY above keeps rows of one table adjacent, which is
            # what groupby needs to produce a single group per table.
            for (k, group) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                (table_schema, table_name) = k

                # A column that takes part in several constraints comes back
                # once per constraint. Keep one row per column, preferring the
                # primary-key row so key detection below still sees it.
                unique_cols = {}
                for c in group:
                    if (c.column_name not in unique_cols
                            or c.constraint_type == 'PRIMARY KEY'):
                        unique_cols[c.column_name] = c
                cols = list(unique_cols.values())

                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                md_map = metadata.to_map(create_column_metadata(cols))
                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                # A table missing from the table listing is treated as a
                # regular table and gets no table-level details.
                details = table_info.get(k)
                is_view = details is not None and details['is_view']

                if details is not None:
                    # 'row_count' is not populated by the table query above,
                    # so this only writes a value if one is added there.
                    row_count = details.get('row_count')
                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                # TABLE_CONSTRAINTS.CONSTRAINT_TYPE spells primary keys as
                # 'PRIMARY KEY'; columns whose schema is flagged
                # 'unsupported' cannot serve as key properties.
                key_properties = [
                    c.column_name for c in cols
                    if c.constraint_type == 'PRIMARY KEY'
                    and schema.properties[c.column_name].inclusion != 'unsupported'
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entries.append(
                    CatalogEntry(
                        table=table_name,
                        stream=table_name,
                        metadata=metadata.to_list(md_map),
                        tap_stream_id=common.generate_tap_stream_id(
                            table_schema, table_name),
                        schema=schema))

        return Catalog(entries)