def sync_table(mssql_conn, config, catalog_entry, state, columns):
    mssql_conn = MSSQLConnection(config)
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get("replication-key")
    replication_key_state = singer.get_bookmark(state, catalog_entry.tap_stream_id, "replication_key")

    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, "replication_key_value"
        )
    else:
        state = singer.write_bookmark(
            state, catalog_entry.tap_stream_id, "replication_key", replication_key_metadata
        )
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id, "replication_key_value")

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, "version", stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version
    )

    singer.write_message(activate_version_message)
    LOGGER.info("Beginning SQL")
    with connect_with_backoff(mssql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[replication_key_metadata].format == "date-time":
                    replication_key_value = pendulum.parse(replication_key_value)

                select_sql += " WHERE \"{}\" >= %(replication_key_value)s ORDER BY \"{}\" ASC".format(
                    replication_key_metadata, replication_key_metadata
                )
                params["replication_key_value"] = replication_key_value
            elif replication_key_metadata is not None:
                select_sql += " ORDER BY \"{}\" ASC".format(replication_key_metadata)

            common.sync_query(cur, catalog_entry, state, select_sql, columns, stream_version, params)
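# Illustrative sketch (not part of the tap): the minimal shape of the singer
# STATE dict that the bookmark helpers above read and write. The stream id and
# values below are hypothetical examples.
def _example_incremental_state():
    import singer

    state = {
        "bookmarks": {
            "dbo-orders": {
                "replication_key": "updated_at",
                "replication_key_value": "2021-01-01T00:00:00+00:00",
            }
        }
    }
    # The value returned here becomes the lower bound of the generated
    # 'WHERE "updated_at" >= %(replication_key_value)s' clause in sync_table.
    return singer.get_bookmark(state, "dbo-orders", "replication_key_value")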
def log_server_params(mssql_conn):
    with connect_with_backoff(mssql_conn) as open_conn:
        try:
            with open_conn.cursor() as cur:
                cur.execute("""SELECT @@VERSION as version, @@lock_timeout as lock_wait_timeout""")
                row = cur.fetchone()
                LOGGER.info(
                    "Server Parameters: " + "version: %s, " + "lock_timeout: %s, ",
                    *row,
                )
        except Exception as e:
            LOGGER.warning("Encountered error checking server params. Error: (%s) %s", *e.args)
def sync_table(mssql_conn, catalog_entry, state, columns, stream_version):
    common.whitelist_bookmark_keys(
        generate_bookmark_keys(catalog_entry), catalog_entry.tap_stream_id, state
    )

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = True if 'version' in bookmark else False

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete'
    )

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version
    )

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists and state_version is None):
        singer.write_message(activate_version_message)

    perform_resumable_sync = sync_is_resumable(mssql_conn, catalog_entry)

    pk_clause = ""

    with connect_with_backoff(mssql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            if perform_resumable_sync:
                LOGGER.info(
                    "Full table sync is resumable based on primary key definition, will replicate incrementally"
                )

                state = update_incremental_full_table_state(catalog_entry, state, cur)
                pk_clause = generate_pk_clause(catalog_entry, state)

            select_sql += pk_clause
            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns, stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'last_pk_fetched')

    singer.write_message(activate_version_message)
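# Hedged sketch (an assumption, not the tap's actual generate_pk_clause): one
# way the 'last_pk_fetched' and 'max_pk_values' bookmarks used above could be
# turned into a WHERE clause so an interrupted FULL_TABLE sync resumes after
# the last emitted row. The pk_column default is hypothetical.
def _example_pk_clause(state, tap_stream_id, pk_column="id"):
    import singer

    last_fetched = singer.get_bookmark(state, tap_stream_id, "last_pk_fetched") or {}
    max_values = singer.get_bookmark(state, tap_stream_id, "max_pk_values") or {}

    if pk_column in last_fetched and pk_column in max_values:
        # Resume strictly after the last fetched key, up to the max key value
        # captured when the sync started, in primary-key order.
        return ' WHERE "{0}" > {1} AND "{0}" <= {2} ORDER BY "{0}"'.format(
            pk_column, last_fetched[pk_column], max_values[pk_column]
        )
    return ""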
def sync_table(mssql_conn, config, catalog_entry, state, columns, stream_version):
    mssql_conn = MSSQLConnection(config)
    common.whitelist_bookmark_keys(
        generate_bookmark_keys(catalog_entry), catalog_entry.tap_stream_id, state
    )

    bookmark = state.get("bookmarks", {}).get(catalog_entry.tap_stream_id, {})
    version_exists = True if "version" in bookmark else False

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, "initial_full_table_complete"
    )

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id, "version")

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version
    )

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists and state_version is None):
        singer.write_message(activate_version_message)

    with connect_with_backoff(mssql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns, stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, "max_pk_values")
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, "last_pk_fetched")

    singer.write_message(activate_version_message)
def sync_is_resumable(mssql_conn, catalog_entry):
    """
    In order to resume a full table sync, a table requires a primary key whose
    columns all have data types listed in RESUMABLE_PK_TYPES.
    """
    database_name = common.get_database_name(catalog_entry)
    key_properties = common.get_key_properties(catalog_entry)

    if not key_properties:
        return False

    sql = """SELECT data_type
               FROM information_schema.columns
              WHERE table_schema = '{}'
                AND table_name = '{}'
                AND column_name = '{}'
    """

    with connect_with_backoff(mssql_conn) as open_conn:
        with open_conn.cursor() as cur:
            for pk in key_properties:
                cur.execute(sql.format(database_name, catalog_entry.table, pk))

                result = cur.fetchone()

                if not result:
                    raise Exception("Primary key column {} does not exist.".format(pk))

                if result[0] not in RESUMABLE_PK_TYPES:
                    LOGGER.warning(
                        "Found primary key column %s with type %s. Will not be able "
                        + "to resume interrupted FULL_TABLE sync using this key.",
                        pk,
                        result[0],
                    )
                    return False

    return True
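# Hedged sketch (an assumption): RESUMABLE_PK_TYPES is defined elsewhere in the
# tap; it is presumably a set of column types whose values order
# deterministically, so a primary-key range can be used to resume. The set
# below is illustrative only, not the tap's actual constant.
_EXAMPLE_RESUMABLE_PK_TYPES = {
    "tinyint", "smallint", "int", "bigint",
    "date", "datetime", "datetime2", "smalldatetime",
}


def _example_is_resumable(pk_data_types):
    # Mirrors the check above: every primary-key column type must be resumable.
    return all(t in _EXAMPLE_RESUMABLE_PK_TYPES for t in pk_data_types)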
def discover_catalog(mssql_conn, config):
    """Returns a Catalog describing the structure of the database."""
    LOGGER.info("Preparing Catalog")
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        filter_dbs_clause = ",".join(["'{}'".format(db) for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE c.table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE c.TABLE_SCHEMA NOT IN (
        'information_schema',
        'performance_schema',
        'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute(
            """SELECT TABLE_SCHEMA,
                      TABLE_NAME,
                      TABLE_TYPE
                 FROM INFORMATION_SCHEMA.TABLES c
                {}
            """.format(table_schema_clause)
        )
        table_info = {}

        for (db, table, table_type) in cur.fetchall():
            if db not in table_info:
                table_info[db] = {}

            table_info[db][table] = {"row_count": None, "is_view": table_type == "VIEW"}

        LOGGER.info("Tables fetched, fetching columns")
        cur.execute(
            """with constraint_columns as (
                select c.TABLE_SCHEMA
                     , c.TABLE_NAME
                     , c.COLUMN_NAME
                  from INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE c
                  join INFORMATION_SCHEMA.TABLE_CONSTRAINTS tc
                    on tc.TABLE_SCHEMA = c.TABLE_SCHEMA
                   and tc.TABLE_NAME = c.TABLE_NAME
                   and tc.CONSTRAINT_NAME = c.CONSTRAINT_NAME
                   and tc.CONSTRAINT_TYPE in ('PRIMARY KEY', 'UNIQUE'))
               SELECT c.TABLE_SCHEMA,
                      c.TABLE_NAME,
                      c.COLUMN_NAME,
                      DATA_TYPE,
                      CHARACTER_MAXIMUM_LENGTH,
                      NUMERIC_PRECISION,
                      NUMERIC_SCALE,
                      case when cc.COLUMN_NAME is null then 0 else 1 end
                 FROM INFORMATION_SCHEMA.COLUMNS c
                 left join constraint_columns cc
                   on cc.TABLE_NAME = c.TABLE_NAME
                  and cc.TABLE_SCHEMA = c.TABLE_SCHEMA
                  and cc.COLUMN_NAME = c.COLUMN_NAME
                {}
                ORDER BY c.TABLE_SCHEMA, c.TABLE_NAME
            """.format(table_schema_clause)
        )
        columns = []
        rec = cur.fetchone()

        while rec is not None:
            columns.append(Column(*rec))
            rec = cur.fetchone()

        LOGGER.info("Columns Fetched")
        entries = []

        for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object",
                properties={c.column_name: schema_for_column(c) for c in cols},
            )
            md = create_column_metadata(cols)
            md_map = metadata.to_map(md)

            md_map = metadata.write(md_map, (), "database-name", table_schema)

            is_view = table_info[table_schema][table_name]["is_view"]

            if table_schema in table_info and table_name in table_info[table_schema]:
                row_count = table_info[table_schema][table_name].get("row_count")

                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)

                md_map = metadata.write(md_map, (), "is-view", is_view)

            key_properties = [c.column_name for c in cols if c.is_primary_key == 1]

            md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

            entry = CatalogEntry(
                table=table_name,
                stream=table_name,
                metadata=metadata.to_list(md_map),
                tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                schema=schema,
            )

            entries.append(entry)

    LOGGER.info("Catalog ready")
    return Catalog(entries)
def discover_catalog(mssql_conn, config):
    """Returns a Catalog describing the structure of the database."""
    LOGGER.info("Preparing Catalog")
    mssql_conn = MSSQLConnection(config)
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        filter_dbs_clause = ",".join(["'{}'".format(db) for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE c.table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE c.table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'sys'
        )"""

    with connect_with_backoff(mssql_conn) as open_conn:
        cur = open_conn.cursor()
        LOGGER.info("Fetching tables")
        cur.execute(
            """SELECT table_schema,
                      table_name,
                      table_type
                 FROM information_schema.tables c
                {}
            """.format(
                table_schema_clause
            )
        )
        table_info = {}

        for (db, table, table_type) in cur.fetchall():
            if db not in table_info:
                table_info[db] = {}

            table_info[db][table] = {"row_count": None, "is_view": table_type == "VIEW"}

        LOGGER.info("Tables fetched, fetching columns")
        cur.execute(
            """with constraint_columns as (
                select c.table_schema
                     , c.table_name
                     , c.column_name
                  from information_schema.constraint_column_usage c
                  join information_schema.table_constraints tc
                    on tc.table_schema = c.table_schema
                   and tc.table_name = c.table_name
                   and tc.constraint_name = c.constraint_name
                   and tc.constraint_type in ('PRIMARY KEY', 'UNIQUE'))
               SELECT c.table_schema,
                      c.table_name,
                      c.column_name,
                      data_type,
                      character_maximum_length,
                      numeric_precision,
                      numeric_scale,
                      case when cc.column_name is null then 0 else 1 end
                 FROM information_schema.columns c
                 left join constraint_columns cc
                   on cc.table_name = c.table_name
                  and cc.table_schema = c.table_schema
                  and cc.column_name = c.column_name
                {}
                ORDER BY c.table_schema, c.table_name
            """.format(
                table_schema_clause
            )
        )
        columns = []
        rec = cur.fetchone()

        while rec is not None:
            columns.append(Column(*rec))
            rec = cur.fetchone()

        LOGGER.info("Columns Fetched")
        entries = []

        for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
            cols = list(cols)
            (table_schema, table_name) = k
            schema = Schema(
                type="object", properties={c.column_name: schema_for_column(c) for c in cols}
            )
            md = create_column_metadata(cols)
            md_map = metadata.to_map(md)

            md_map = metadata.write(md_map, (), "database-name", table_schema)

            is_view = table_info[table_schema][table_name]["is_view"]

            if table_schema in table_info and table_name in table_info[table_schema]:
                row_count = table_info[table_schema][table_name].get("row_count")

                if row_count is not None:
                    md_map = metadata.write(md_map, (), "row-count", row_count)

                md_map = metadata.write(md_map, (), "is-view", is_view)

            key_properties = [c.column_name for c in cols if c.is_primary_key == 1]

            md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

            entry = CatalogEntry(
                table=table_name,
                stream=table_name,
                metadata=metadata.to_list(md_map),
                tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                schema=schema,
            )

            entries.append(entry)

    LOGGER.info("Catalog ready")
    return Catalog(entries)
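# Illustrative sketch (not part of the tap): how the singer.metadata helpers
# used in discover_catalog turn breadcrumb/value pairs into the metadata list
# stored on a CatalogEntry. The table, column, and schema names are hypothetical.
def _example_stream_metadata():
    from singer import metadata

    md_map = metadata.to_map([])  # start from an empty metadata list
    md_map = metadata.write(md_map, (), "database-name", "dbo")
    md_map = metadata.write(md_map, (), "table-key-properties", ["id"])
    md_map = metadata.write(md_map, ("properties", "id"), "sql-datatype", "int")

    # Returns roughly:
    # [{"breadcrumb": (), "metadata": {"database-name": "dbo", "table-key-properties": ["id"]}},
    #  {"breadcrumb": ("properties", "id"), "metadata": {"sql-datatype": "int"}}]
    return metadata.to_list(md_map)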
def discover(conn, config):
    with connect_with_backoff(conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
                SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
                From INFORMATION_SCHEMA.TABLES
            """)
            table_info = {}
            schemas = cur.fetchall()

            for (db, schema, table, table_type) in schemas:
                if db not in table_info:
                    table_info[db] = {}
                if schema not in table_info[db]:
                    table_info[db][schema] = {}

                table_info[db][schema][table] = {
                    # 'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            # NOTE: the second NUMERIC_PRECISION in the original query was a
            # duplicate; Column expects numeric_scale in that position.
            cur.execute("""
                SELECT C.TABLE_SCHEMA, C.TABLE_NAME, C.COLUMN_NAME, C.DATA_TYPE,
                       C.CHARACTER_MAXIMUM_LENGTH, C.NUMERIC_PRECISION, C.NUMERIC_SCALE,
                       TC.CONSTRAINT_TYPE
                FROM INFORMATION_SCHEMA.COLUMNS C
                LEFT JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE CCU
                    ON C.COLUMN_NAME = CCU.COLUMN_NAME
                LEFT JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS TC
                    ON CCU.CONSTRAINT_NAME = TC.CONSTRAINT_NAME
                ORDER BY C.TABLE_SCHEMA, C.TABLE_NAME
            """)

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(
                    type='object',
                    properties={c.column_name: schema_for_column(c) for c in cols})
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)
                md_map = metadata.write(md_map, (), 'database-name', table_schema)

                # table_info is keyed catalog -> schema -> table; `db` is the
                # catalog of the connected database from the tables query above.
                is_view = table_info[db][table_schema][table_name]['is_view']

                if db in table_info and table_schema in table_info[db] \
                        and table_name in table_info[db][table_schema]:
                    row_count = table_info[db][table_schema][table_name].get('row_count')
                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count', row_count)
                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                # SQL Server's INFORMATION_SCHEMA reports 'PRIMARY KEY' (not
                # MySQL's 'PRI') as the constraint type.
                column_is_key_prop = lambda c, s: (
                    c.constraint_type == 'PRIMARY KEY'
                    and s.properties[c.column_name].inclusion != 'unsupported')

                key_properties = [c.column_name for c in cols if column_is_key_prop(c, schema)]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties', key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                    schema=schema)

                entries.append(entry)

            return Catalog(entries)

    # NOTE: everything below is unreachable leftover scaffolding from the
    # singer tap template (the function returns above).
    raw_schemas = load_schemas()
    streams = []

    for schema_name, schema in raw_schemas.items():
        # TODO: populate any metadata and stream's key properties here..
        stream_metadata = []
        stream_key_properties = []

        # create and add catalog entry
        catalog_entry = {
            'stream': schema_name,
            'tap_stream_id': schema_name,
            'schema': schema,
            'metadata': [],
            'key_properties': []
        }
        streams.append(catalog_entry)

    return {'streams': streams}
def runTest(self):
    connection = test_utils.get_test_connection()

    with connect_with_backoff(connection) as conn:
        result = conn.execute_scalar('SELECT 1 + 1')
        self.assertEqual(result, 2)