Example #1
def sync_table(mssql_conn, config, catalog_entry, state, columns):
    # A fresh connection is built from config; the mssql_conn argument is
    # effectively replaced here.
    mssql_conn = MSSQLConnection(config)
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get("replication-key")
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                "replication_key")

    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, "replication_key_value")
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      "replication_key",
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      "replication_key_value")

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  "version", stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    singer.write_message(activate_version_message)
    LOGGER.info("Beginning SQL")
    with connect_with_backoff(mssql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[
                        replication_key_metadata].format == "date-time":
                    replication_key_value = pendulum.parse(
                        replication_key_value)

                select_sql += " WHERE \"{}\" >= %(replication_key_value)s ORDER BY \"{}\" ASC".format(
                    replication_key_metadata, replication_key_metadata)

                params["replication_key_value"] = replication_key_value
            elif replication_key_metadata is not None:
                select_sql += " ORDER BY \"{}\" ASC".format(
                    replication_key_metadata)

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)
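
Example #1 is the key-based incremental path: the replication key and its
last-seen value live in the state bookmark, and the generated SELECT resumes
from that value. For orientation, here is a minimal, hypothetical driver loop
for it; the do_sync name and the schema/state handling are assumptions for
illustration, not code from the tap:

import singer

def do_sync(config, state):
    # Hypothetical wiring of the pieces shown on this page; MSSQLConnection,
    # discover_catalog, and sync_table are the names used in these examples.
    mssql_conn = MSSQLConnection(config)
    catalog = discover_catalog(mssql_conn, config)

    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())
        singer.write_schema(catalog_entry.stream,
                            catalog_entry.schema.to_dict(),
                            catalog_entry.key_properties or [])
        sync_table(mssql_conn, config, catalog_entry, state, columns)
        singer.write_state(state)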
Example #2
def log_server_params(mssql_conn):
    with connect_with_backoff(mssql_conn) as open_conn:
        try:
            with open_conn.cursor() as cur:
                cur.execute("""SELECT @@VERSION as version, @@lock_timeout as lock_wait_timeout""")
                row = cur.fetchone()
                LOGGER.info(
                    "Server Parameters: version: %s, lock_timeout: %s", *row
                )
        except Exception as e:
            # Bind the exception so its args can be logged; the original
            # bare `except:` referenced an undefined `e`.
            LOGGER.warning("Encountered error checking server params. Error: (%s) %s", *e.args)
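
Every example on this page funnels its connection through connect_with_backoff,
which is not shown. In sibling Singer taps such as tap-mysql it retries the
initial connect with exponential backoff via the backoff library; a sketch
under that assumption:

import backoff

# Sketch modeled on similar Singer taps; retrying on the broad Exception is an
# assumption, and a real tap would narrow it to the driver's transient
# connection errors.
@backoff.on_exception(backoff.expo, Exception, max_tries=5, factor=2)
def connect_with_backoff(connection):
    connection.connect()
    return connection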
Example #3
def sync_table(mssql_conn, catalog_entry, state, columns, stream_version):
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   catalog_entry.tap_stream_id, state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        singer.write_message(activate_version_message)

    perform_resumable_sync = sync_is_resumable(mssql_conn, catalog_entry)

    pk_clause = ""

    with connect_with_backoff(mssql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            if perform_resumable_sync:
                LOGGER.info(
                    "Full table sync is resumable based on primary key definition, will replicate incrementally"
                )

                state = update_incremental_full_table_state(
                    catalog_entry, state, cur)
                pk_clause = generate_pk_clause(catalog_entry, state)

            select_sql += pk_clause
            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                          'last_pk_fetched')

    singer.write_message(activate_version_message)
Example #4
def sync_table(mssql_conn, config, catalog_entry, state, columns,
               stream_version):
    mssql_conn = MSSQLConnection(config)
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   catalog_entry.tap_stream_id, state)

    bookmark = state.get("bookmarks", {}).get(catalog_entry.tap_stream_id, {})
    version_exists = True if "version" in bookmark else False

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, "initial_full_table_complete")

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        "version")

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        singer.write_message(activate_version_message)

    with connect_with_backoff(mssql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, "max_pk_values")
    singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                          "last_pk_fetched")

    singer.write_message(activate_version_message)
Example #5
def sync_is_resumable(mysql_conn, catalog_entry):
    ''' In order to resume a full table sync, a table requires defined key
    properties, and every primary-key column must have a type listed in
    RESUMABLE_PK_TYPES.
    '''
    database_name = common.get_database_name(catalog_entry)
    key_properties = common.get_key_properties(catalog_entry)

    if not key_properties:
        return False

    sql = """SELECT data_type
               FROM information_schema.columns
              WHERE table_schema = '{}'
                AND table_name = '{}'
                AND column_name = '{}'
    """

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            for pk in key_properties:
                cur.execute(sql.format(database_name, catalog_entry.table, pk))

                result = cur.fetchone()

                if not result:
                    raise Exception(
                        "Primary key column {} does not exist.".format(pk))

                if result[0] not in RESUMABLE_PK_TYPES:
                    LOGGER.warning(
                        "Found primary key column %s with type %s. Will not be able "
                        "to resume interrupted FULL_TABLE sync using this key.",
                        pk, result[0])
                    return False

    return True
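
sync_is_resumable gates the resumable full-table strategy in Example #3: when
every primary-key column has a resumable type, the tap pages through the table
in primary-key order and bookmarks the last key fetched. A hypothetical
single-column generate_pk_clause sketches the clause appended there; the real
helper must also cope with composite keys:

def generate_pk_clause(catalog_entry, state):
    # Hypothetical single-column sketch; assumes a numeric primary key so the
    # bookmarked value can be interpolated directly.
    pk = common.get_key_properties(catalog_entry)[0]
    last_pk_fetched = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          'last_pk_fetched') or {}
    last_value = last_pk_fetched.get(pk)

    if last_value is None:
        return ' ORDER BY "{}" ASC'.format(pk)

    # Resume strictly after the last bookmarked key value.
    return ' WHERE "{}" > {} ORDER BY "{}" ASC'.format(pk, last_value, pk)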
Example #6
def discover_catalog(mssql_conn, config):
    """Returns a Catalog describing the structure of the database."""
    LOGGER.info("Preparing Catalog")
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        filter_dbs_clause = ",".join(
            ["'{}'".format(db) for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE c.table_schema IN ({})".format(
            filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE c.TABLE_SCHEMA NOT IN (
        'information_schema',
        'performance_schema',
        'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute("""SELECT TABLE_SCHEMA,
                TABLE_NAME,
                TABLE_TYPE
            FROM INFORMATION_SCHEMA.TABLES c
            {}
        """.format(table_schema_clause))
        table_info = {}

        for (db, table, table_type) in cur.fetchall():
            if db not in table_info:
                table_info[db] = {}

            table_info[db][table] = {
                "row_count": None,
                "is_view": table_type == "VIEW"
            }
        LOGGER.info("Tables fetched, fetching columns")
        cur.execute("""with constraint_columns as (
                select c.TABLE_SCHEMA
                , c.TABLE_NAME
                , c.COLUMN_NAME

                from INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE c

                join INFORMATION_SCHEMA.TABLE_CONSTRAINTS tc
                        on tc.TABLE_SCHEMA = c.TABLE_SCHEMA
                        and tc.TABLE_NAME = c.TABLE_NAME
                        and tc.CONSTRAINT_NAME = c.CONSTRAINT_NAME
                        and tc.CONSTRAINT_TYPE in ('PRIMARY KEY', 'UNIQUE'))
                SELECT c.TABLE_SCHEMA,
                    c.TABLE_NAME,
                    c.COLUMN_NAME,
                    DATA_TYPE,
                    CHARACTER_MAXIMUM_LENGTH,
                    NUMERIC_PRECISION,
                    NUMERIC_SCALE,
                    case when cc.COLUMN_NAME is null then 0 else 1 end
                FROM INFORMATION_SCHEMA.COLUMNS c

                left join constraint_columns cc
                    on cc.TABLE_NAME = c.TABLE_NAME
                    and cc.TABLE_SCHEMA = c.TABLE_SCHEMA
                    and cc.COLUMN_NAME = c.COLUMN_NAME

                {}
                ORDER BY c.TABLE_SCHEMA, c.TABLE_NAME
        """.format(table_schema_clause))
        columns = []
        rec = cur.fetchone()
        while rec is not None:
            columns.append(Column(*rec))
            rec = cur.fetchone()
        LOGGER.info("Columns Fetched")
        entries = []
        for (k, cols) in itertools.groupby(
                columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object",
                properties={c.column_name: schema_for_column(c)
                            for c in cols})
            md = create_column_metadata(cols)
            md_map = metadata.to_map(md)

            md_map = metadata.write(md_map, (), "database-name", table_schema)

            if table_schema in table_info and table_name in table_info[
                    table_schema]:
                is_view = table_info[table_schema][table_name]["is_view"]
                row_count = table_info[table_schema][table_name].get(
                    "row_count")

                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)

                md_map = metadata.write(md_map, (), "is-view", is_view)

            key_properties = [
                c.column_name for c in cols if c.is_primary_key == 1
            ]

            md_map = metadata.write(md_map, (), "table-key-properties",
                                    key_properties)

            entry = CatalogEntry(
                table=table_name,
                stream=table_name,
                metadata=metadata.to_list(md_map),
                tap_stream_id=common.generate_tap_stream_id(
                    table_schema, table_name),
                schema=schema,
            )

            entries.append(entry)
    LOGGER.info("Catalog ready")
    return Catalog(entries)
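
The metadata.to_map / metadata.write / metadata.to_list round trip above is the
standard singer-python pattern for attaching stream-level properties. Roughly,
the compiled map keys breadcrumbs to metadata dicts:

from singer import metadata

md_map = metadata.to_map([])  # start from an empty compiled map
md_map = metadata.write(md_map, (), "database-name", "dbo")
md_map = metadata.write(md_map, (), "table-key-properties", ["id"])

# to_list() flattens the map back into the catalog's list-of-dicts form,
# roughly [{'breadcrumb': (), 'metadata': {'database-name': 'dbo',
#                                          'table-key-properties': ['id']}}]
print(metadata.to_list(md_map))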
Example #7
def discover_catalog(mssql_conn, config):
    """Returns a Catalog describing the structure of the database."""
    LOGGER.info("Preparing Catalog")
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        filter_dbs_clause = ",".join(["'{}'".format(db) for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE c.table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE c.table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute(
            """SELECT table_schema,
                table_name,
                table_type
            FROM information_schema.tables c
            {}
        """.format(
                table_schema_clause
            )
        )
        table_info = {}

        for (db, table, table_type) in cur.fetchall():
            if db not in table_info:
                table_info[db] = {}

            table_info[db][table] = {"row_count": None, "is_view": table_type == "VIEW"}
        LOGGER.info("Tables fetched, fetching columns")
        cur.execute(
            """with constraint_columns as (
                select c.table_schema
                , c.table_name
                , c.column_name

                from information_schema.constraint_column_usage c

                join information_schema.table_constraints tc
                        on tc.table_schema = c.table_schema
                        and tc.table_name = c.table_name
                        and tc.constraint_name = c.constraint_name
                        and tc.constraint_type in ('PRIMARY KEY', 'UNIQUE'))
                SELECT c.table_schema,
                    c.table_name,
                    c.column_name,
                    data_type,
                    character_maximum_length,
                    numeric_precision,
                    numeric_scale,
                    case when cc.column_name is null then 0 else 1 end
                FROM information_schema.columns c

                left join constraint_columns cc
                    on cc.table_name = c.table_name
                    and cc.table_schema = c.table_schema
                    and cc.column_name = c.column_name

                {}
                ORDER BY c.table_schema, c.table_name
        """.format(
                table_schema_clause
            )
        )
        columns = []
        rec = cur.fetchone()
        while rec is not None:
            columns.append(Column(*rec))
            rec = cur.fetchone()
        LOGGER.info("Columns Fetched")
        entries = []
        for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object", properties={c.column_name: schema_for_column(c) for c in cols}
            )
            md = create_column_metadata(cols)
            md_map = metadata.to_map(md)

            md_map = metadata.write(md_map, (), "database-name", table_schema)

            if table_schema in table_info and table_name in table_info[table_schema]:
                is_view = table_info[table_schema][table_name]["is_view"]
                row_count = table_info[table_schema][table_name].get("row_count")

                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)

                md_map = metadata.write(md_map, (), "is-view", is_view)

            key_properties = [c.column_name for c in cols if c.is_primary_key == 1]

            md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

            entry = CatalogEntry(
                table=table_name,
                stream=table_name,
                metadata=metadata.to_list(md_map),
                tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                schema=schema,
            )

            entries.append(entry)
    LOGGER.info("Catalog ready")
    return Catalog(entries)
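
Both discovery examples lean on schema_for_column to turn an
information_schema row into a JSON Schema, and Example #8 inspects the
resulting inclusion flag. Its body is not shown on this page; a minimal sketch
of the kind of mapping involved, with a deliberately incomplete type table:

from singer.schema import Schema

def schema_for_column(column):
    # Minimal sketch only; the real function covers many more SQL Server
    # types and distinguishes automatic from available inclusion.
    data_type = column.data_type.lower()

    if data_type == "bit":
        return Schema(type=["null", "boolean"], inclusion="available")
    if data_type in ("int", "bigint", "smallint", "tinyint"):
        return Schema(type=["null", "integer"], inclusion="available")
    if data_type in ("float", "real", "decimal", "numeric"):
        return Schema(type=["null", "number"], inclusion="available")
    if data_type in ("char", "varchar", "nvarchar", "text"):
        return Schema(type=["null", "string"], inclusion="available",
                      maxLength=column.character_maximum_length)
    if data_type in ("date", "datetime", "datetime2"):
        return Schema(type=["null", "string"], format="date-time",
                      inclusion="available")

    return Schema(inclusion="unsupported",
                  description="Unsupported data type {}".format(data_type))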
Example #8
def discover(conn, config):

    with connect_with_backoff(conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
            SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE From INFORMATION_SCHEMA.TABLES
            """)

            table_info = {}

            schemas = cur.fetchall()
            for (db, schema, table, table_type) in schemas:
                if db not in table_info:
                    table_info[db] = {}
                if schema not in table_info[db]:
                    table_info[db][schema] = {}

                table_info[db][schema][table] = {
                    # 'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute("""
            SELECT
       C.TABLE_SCHEMA, C.TABLE_NAME, C.COLUMN_NAME, C.DATA_TYPE, C.CHARACTER_MAXIMUM_LENGTH, C.NUMERIC_PRECISION,
       C.NUMERIC_SCALE, TC.CONSTRAINT_TYPE -- was NUMERIC_PRECISION twice; Column expects the scale here
FROM INFORMATION_SCHEMA.COLUMNS C
    LEFT JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE CCU On C.COLUMN_NAME = CCU.COLUMN_NAME
    LEFT JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS TC ON CCU.CONSTRAINT_NAME = Tc.CONSTRAINT_NAME
ORDER BY C.TABLE_SCHEMA, C.TABLE_NAME
            """)
            # res = cur.fetchall()

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                # table_info is keyed catalog -> schema -> table; `db` is the
                # last catalog seen in the tables query above. Guard the
                # lookup so a missing table cannot raise a KeyError.
                table_entry = table_info.get(db, {}).get(table_schema,
                                                         {}).get(table_name, {})
                is_view = table_entry.get('is_view', False)
                row_count = table_entry.get('row_count')

                if row_count is not None:
                    md_map = metadata.write(md_map, (), 'row-count',
                                            row_count)

                md_map = metadata.write(md_map, (), 'is-view', is_view)

                def column_is_key_prop(c, s):
                    # INFORMATION_SCHEMA reports the constraint type as
                    # 'PRIMARY KEY', not MySQL's abbreviated 'PRI'.
                    return (c.constraint_type == 'PRIMARY KEY'
                            and s.properties[c.column_name].inclusion !=
                            'unsupported')

                key_properties = [
                    c.column_name for c in cols
                    if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

        return Catalog(entries)

Example #9
    def runTest(self):
        connection = test_utils.get_test_connection()

        with connect_with_backoff(connection) as conn:
            result = conn.execute_scalar('SELECT 1 + 1')
            self.assertEqual(result, 2)
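
test_utils.get_test_connection is supplied by the test harness rather than the
tap itself. A plausible helper, assuming connection settings come from
environment variables (the variable names are made up for illustration):

import os

def get_test_connection():
    # Hypothetical test helper; the environment variable names are
    # illustrative, not taken from the tap's actual test suite.
    config = {
        "host": os.environ["TAP_MSSQL_HOST"],
        "user": os.environ["TAP_MSSQL_USER"],
        "password": os.environ["TAP_MSSQL_PASSWORD"],
        "database": os.environ.get("TAP_MSSQL_DATABASE", "master"),
    }
    return MSSQLConnection(config)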