def sync_table(connection, catalog_entry, state, columns, stream_version):
    """Run a FULL_TABLE sync for one stream, yielding Singer messages.

    Yields an ACTIVATE_VERSION message up-front on the very first sync (so
    records show up in the target right away), then every message produced by
    common.sync_query, and a final ACTIVATE_VERSION once the whole table has
    been replicated.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    # Fix: direct boolean test instead of the redundant
    # `True if ... else False` ternary.
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        yield activate_version_message

    with connection.cursor() as cursor:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        params = {}

        for message in common.sync_query(cursor, catalog_entry, state,
                                         select_sql, columns, stream_version,
                                         params):
            yield message

    yield activate_version_message
def sync_table(mysql_conn, catalog_entry, state, columns, stream_version):
    """Run a FULL_TABLE sync for one stream, resuming by primary key when
    the table's key definition allows it.

    Writes ACTIVATE_VERSION at the start of the very first sync and again
    after a successful full pass, then clears the resumability bookmarks.
    """
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   catalog_entry.tap_stream_id, state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    # Fix: direct boolean test instead of `True if ... else False`.
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        singer.write_message(activate_version_message)

    perform_resumable_sync = sync_is_resumable(mysql_conn, catalog_entry)

    pk_clause = ""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            if perform_resumable_sync:
                LOGGER.info(
                    "Full table sync is resumable based on primary key definition, will replicate incrementally"
                )

                state = update_incremental_full_table_state(
                    catalog_entry, state, cur)
                pk_clause = generate_pk_clause(catalog_entry, state)
                select_sql += pk_clause

            # NOTE(review): on success, _create_temp_table's return value
            # replaces the SELECT built above (including the PK clause) —
            # presumably it returns a SELECT against the temp table; confirm.
            # On failure we log and fall back to the original SELECT.
            try:
                select_sql = _create_temp_table(mysql_conn, catalog_entry,
                                                columns, pk_clause)
            except Exception as ex:
                # Fix: use the module-level LOGGER (as elsewhere in this
                # function) with lazy %-args instead of the root logger.
                LOGGER.warning("creating temp table failed: %s", str(ex))

            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                          'last_pk_fetched')

    singer.write_message(activate_version_message)
def sync_table(mysql_conn, catalog_entry, state, columns, original_state_file=''):
    """Incrementally sync one stream, ordered by its replication key.

    Resumes from the bookmarked replication-key value only when the key
    bookmarked in state matches the key configured in the catalog metadata;
    otherwise the stale value is discarded and the sync restarts.
    """
    tap_stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    # Replication key as configured in the catalog vs. the one bookmarked
    # from the previous run.
    md_map = metadata.to_map(catalog_entry.metadata)
    replication_key_metadata = md_map.get((), {}).get('replication-key')
    replication_key_state = singer.get_bookmark(state, tap_stream_id,
                                                'replication_key')

    if replication_key_metadata == replication_key_state:
        # Same key as last run: resume from the stored value.
        replication_key_value = singer.get_bookmark(state, tap_stream_id,
                                                    'replication_key_value')
    else:
        # Key changed (or first run): persist the new key, drop the value.
        replication_key_value = None
        state = singer.write_bookmark(state, tap_stream_id,
                                      'replication_key',
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(tap_stream_id, state)
    state = singer.write_bookmark(state, tap_stream_id, 'version',
                                  stream_version)

    # Stream name is prefixed with the database name.
    singer.write_message(singer.ActivateVersionMessage(
        stream='%s_%s' % (common.get_database_name(catalog_entry),
                          catalog_entry.stream),
        version=stream_version))

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                key_format = catalog_entry.schema.properties[
                    replication_key_metadata].format
                if key_format == 'date-time':
                    replication_key_value = pendulum.parse(
                        replication_key_value)

                select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                    replication_key_metadata, replication_key_metadata)
                params['replication_key_value'] = replication_key_value

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params, original_state_file)
def sync_table(connection, catalog_entry, state, columns):
    """Generator: incrementally sync one stream ordered by its replication key.

    Yields an ACTIVATE_VERSION message followed by everything produced by
    common.sync_query.  The bookmarked replication-key value is honoured only
    when the bookmarked key matches the catalog's configured key.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    stream_metadata = metadata.to_map(catalog_entry.metadata).get((), {})
    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'replication_key')

    # Prefer the key already bookmarked in state, falling back to the catalog.
    replication_key = replication_key_state or replication_key_metadata

    if replication_key_metadata == replication_key_state:
        # Unchanged key: resume from the stored value.
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        # Key changed: persist it and discard the stale value.
        replication_key_value = None
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key', replication_key)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    yield singer.ActivateVersionMessage(stream=catalog_entry.stream,
                                        version=stream_version)

    with connection.cursor() as cursor:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        params = {}

        if replication_key_value is not None:
            if catalog_entry.schema.properties[replication_key].format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                replication_key, replication_key)
            params['replication_key_value'] = replication_key_value
        elif replication_key is not None:
            # No stored value yet, but still order by the key so the
            # emitted bookmark stays monotonic.
            select_sql += ' ORDER BY `{}` ASC'.format(replication_key)

        for message in common.sync_query(cursor, catalog_entry, state,
                                         select_sql, columns, stream_version,
                                         params):
            yield message
def sync_table(mysql_conn, catalog_entry, state, columns):
    """Incrementally sync one stream, ordered by its replication key.

    Emits an ACTIVATE_VERSION message, then runs the paginated query via
    common.sync_query, resuming from the bookmarked replication-key value
    when the bookmarked key still matches the catalog's configured key.
    """
    tap_stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    stream_metadata = metadata.to_map(catalog_entry.metadata).get((), {})
    replication_key_metadata = stream_metadata.get("replication-key")
    replication_key_state = singer.get_bookmark(state, tap_stream_id, "replication_key")

    if replication_key_metadata == replication_key_state:
        # Same key as the previous run: resume from the bookmarked value.
        replication_key_value = singer.get_bookmark(
            state, tap_stream_id, "replication_key_value"
        )
    else:
        # The configured key changed: store it and drop the stale value.
        replication_key_value = None
        state = singer.write_bookmark(
            state, tap_stream_id, "replication_key", replication_key_metadata
        )
        state = singer.clear_bookmark(state, tap_stream_id, "replication_key_value")

    stream_version = common.get_stream_version(tap_stream_id, state)
    state = singer.write_bookmark(state, tap_stream_id, "version", stream_version)

    singer.write_message(
        singer.ActivateVersionMessage(stream=catalog_entry.stream, version=stream_version)
    )

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                column_format = catalog_entry.schema.properties[
                    replication_key_metadata
                ].format
                if column_format == "date-time":
                    replication_key_value = pendulum.parse(replication_key_value)

                select_sql += " WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC".format(
                    replication_key_metadata, replication_key_metadata
                )
                params["replication_key_value"] = replication_key_value
            elif replication_key_metadata is not None:
                # No bookmark yet: still order by the key for resumability.
                select_sql += " ORDER BY `{}` ASC".format(replication_key_metadata)

            common.sync_query(
                cur, catalog_entry, state, select_sql, columns, stream_version, params
            )
def sync_binlog_stream(mysql_conn: MySQLConnection, config: Dict, binlog_streams_map: Dict[str, Any], state: Dict) -> None:
    """
    Capture the binlog events created between the pos in the state and current Master position
    and creates Singer streams to be flushed to stdout
    Args:
        mysql_conn: mysql connection instance
        config: tap config
        binlog_streams_map: tables to stream using binlog
        state: the current state
    """
    for tap_stream_id in binlog_streams_map:
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file = log_pos = gtid = None

    # Resume either from a GTID bookmark or from a (file, pos) bookmark,
    # depending on the tap configuration.
    if config['use_gtid']:
        gtid = calculate_gtid_bookmark(mysql_conn, binlog_streams_map, state, config['engine'])
    else:
        log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)

    reader = None
    try:
        reader = create_binlog_stream_reader(config, log_file, log_pos, gtid)

        # Snapshot the master's current position so the sync can stop once
        # it catches up, instead of streaming indefinitely.
        end_log_file, end_log_pos = fetch_current_log_file_and_pos(mysql_conn)
        LOGGER.info('Current Master binlog file and pos: %s %s', end_log_file, end_log_pos)

        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config, end_log_file, end_log_pos)
    except pymysql.err.OperationalError as ex:
        # MySQL error 1236: the master cannot serve the requested
        # position/GTID (e.g. purged logs or a different server lineage).
        # Log guidance for the operator, then re-raise in every case.
        if ex.args[0] == 1236:
            LOGGER.error(
                'Cannot resume logical replication from given GTID %s! This GTID might date back to before '
                'the new primary has been setup, connect to old primary and consume all binlog events to get '
                'a newer GTID then switch back.', gtid)

        raise
    finally:
        # BinLogStreamReader doesn't implement the `with` methods
        # So, try/finally will close the chain from the top
        if reader:
            reader.close()

    # Emit the final bookmark so the next run resumes where this one stopped.
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    """Replicate binlog events for the selected streams from the bookmarked
    log file/position and emit a final STATE message.

    Uses the configured `server_id` when provided, otherwise fetches one
    from the server.
    """
    binlog_streams_map = generate_streams_map(binlog_streams)

    # Fix: iterate keys directly instead of `.items()` with a discarded value.
    for tap_stream_id in binlog_streams_map:
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map,
                                           state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s",
                    server_id)

    connection_wrapper = make_connection_wrapper(config)

    reader = None
    try:
        # Randomised slave uuid so concurrent readers don't collide.
        # Fix: dropped the stray `f` prefix — the placeholder is filled by
        # %-formatting, not an f-string field (the produced string is
        # unchanged).
        slave_uuid = "bi-reader-%04x" % random.getrandbits(64)
        reader = BinLogStreamReader(
            connection_settings={},
            server_id=server_id,
            slave_uuid=slave_uuid,
            log_file=log_file,
            log_pos=log_pos,
            resume_stream=True,
            only_events=[
                RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent
            ],
            pymysql_wrapper=connection_wrapper,
        )

        LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                    log_file, log_pos)
        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config)
    finally:
        # BinLogStreamReader doesn't implement the `with` methods
        # So, try/finally will close the chain from the top
        # Fix: guard against `reader` still being None when the constructor
        # itself raised — unguarded `reader.close()` would mask the original
        # error with an AttributeError.
        if reader:
            reader.close()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_table(mysql_conn, config, catalog_entry, state, columns, stream_version):
    """Run a FULL_TABLE sync for one stream, writing Singer messages.

    Emits ACTIVATE_VERSION at the start of the very first sync, runs the
    full-table query, clears the resumability bookmarks, and emits a closing
    ACTIVATE_VERSION.  `config` is accepted for signature compatibility with
    callers; it is not used here.
    """
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   catalog_entry.tap_stream_id, state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    # Fix: direct boolean test instead of `True if ... else False`.
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'version')

    # Stream name is prefixed with the database name.
    activate_version_message = singer.ActivateVersionMessage(
        stream='%s_%s' % (common.get_database_name(catalog_entry),
                          catalog_entry.stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        singer.write_message(activate_version_message)

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            # Fix: removed the commented-out duplicate of this call.
            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                          'last_pk_fetched')

    singer.write_message(activate_version_message)
def sync_table(connection, config, catalog_entry, state, columns):
    """Generator: stream binlog events for one table as Singer messages.

    Reads INSERT/UPDATE/DELETE events from the bookmarked binlog position,
    yields a RECORD message per affected row (deletes carry an
    SDC_DELETED_AT timestamp), and periodically yields STATE messages with
    the advancing log_file/log_pos bookmark.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

    verify_binlog_config(connection, catalog_entry)
    verify_log_file_exists(connection, catalog_entry, log_file, log_pos)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'version', stream_version)

    server_id = fetch_server_id(connection)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    # (schema, table) pair used to filter events down to this one stream.
    table_path = (catalog_entry.database, catalog_entry.stream)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)

    rows_saved = 0

    for binlog_event in reader:
        # The bookmarked position points at the last event already
        # processed, so skip it rather than emitting it twice.
        if reader.log_file == log_file and reader.log_pos == log_pos:
            LOGGER.info(
                "Skipping event for log_file=%s and log_pos=%s as it was processed last sync",
                reader.log_file, reader.log_pos)
            continue

        if isinstance(binlog_event, RotateEvent):
            # Server switched to a new binlog file: move the bookmark there.
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', binlog_event.next_binlog)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', binlog_event.position)
        elif (binlog_event.schema, binlog_event.table) == table_path:
            db_column_types = {c.name: c.type for c in binlog_event.columns}

            if isinstance(binlog_event, WriteRowsEvent):
                for row in binlog_event.rows:
                    # Only emit the columns that were selected for this stream.
                    filtered_vals = {
                        k: v
                        for k, v in row['values'].items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)
                    rows_saved = rows_saved + 1
            elif isinstance(binlog_event, UpdateRowsEvent):
                for row in binlog_event.rows:
                    # Updates carry before/after images; emit the new values.
                    filtered_vals = {
                        k: v
                        for k, v in row['after_values'].items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)
                    rows_saved = rows_saved + 1
            elif isinstance(binlog_event, DeleteRowsEvent):
                for row in binlog_event.rows:
                    # Soft-delete convention: emit the last known values with
                    # an SDC_DELETED_AT timestamp taken from the event.
                    event_ts = datetime.datetime.utcfromtimestamp(
                        binlog_event.timestamp).replace(tzinfo=pytz.UTC)

                    vals = row['values']
                    vals[SDC_DELETED_AT] = event_ts

                    filtered_vals = {
                        k: v
                        for k, v in vals.items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)
                    rows_saved = rows_saved + 1

        # Advance the bookmark to the reader's current position after every
        # event (including events for other tables).
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'log_file', reader.log_file)
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'log_pos', reader.log_pos)

        # NOTE(review): while rows_saved is still 0, `0 % N == 0` is true, so
        # a STATE message is yielded after every event until the first row is
        # saved — presumably acceptable, but confirm it is intended.
        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            yield singer.StateMessage(value=copy.deepcopy(state))

    yield singer.StateMessage(value=copy.deepcopy(state))
def sync_table(mysql_conn, catalog_entry, state, columns, stream_version):
    """Run a FULL_TABLE sync for one stream, replicating incrementally by
    primary key when the key columns are auto-incrementing.

    Emits ACTIVATE_VERSION at the start of the very first sync and again
    after a successful pass, then clears the resumability bookmarks.
    """
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   catalog_entry.tap_stream_id,
                                   state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = True if 'version' in bookmark else False

    initial_full_table_complete = singer.get_bookmark(state,
                                                      catalog_entry.tap_stream_id,
                                                      'initial_full_table_complete')

    state_version = singer.get_bookmark(state,
                                        catalog_entry.tap_stream_id,
                                        'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version
    )

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists and state_version is None):
        singer.write_message(activate_version_message)

    key_props_are_auto_incrementing = pks_are_auto_incrementing(mysql_conn, catalog_entry)

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            if key_props_are_auto_incrementing:
                LOGGER.info("Detected auto-incrementing primary key(s) - will replicate incrementally")
                # Resume from a previously-bookmarked max PK if one exists,
                # otherwise capture the current max from the table.
                max_pk_values = singer.get_bookmark(state,
                                                    catalog_entry.tap_stream_id,
                                                    'max_pk_values') or get_max_pk_values(cur, catalog_entry)

                if not max_pk_values:
                    # Empty table (or no max obtainable): fall back to a
                    # plain, non-resumable full-table SELECT.
                    LOGGER.info("No max value for auto-incrementing PK found for table {}".format(catalog_entry.table))
                else:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'max_pk_values',
                                                  max_pk_values)

                    # Bound the SELECT by last-fetched/max PK so an
                    # interrupted sync can resume where it stopped.
                    pk_clause = generate_pk_clause(catalog_entry, state)

                    select_sql += pk_clause

            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'last_pk_fetched')

    singer.write_message(activate_version_message)
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    """Stream binlog events for all selected streams and write Singer messages.

    Resumes from the bookmarked log_file/log_pos, dispatches row events to
    per-event handlers, periodically flushes STATE, and stops once the
    reader catches up with the master position captured at start-up.
    """
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    # Use the configured server_id when provided, otherwise ask the server.
    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s", server_id)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)

    rows_saved = 0
    events_skipped = 0

    # Master position at start-up; used below to decide when to stop reading.
    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            # Server switched binlog files: move every stream's bookmark.
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                # Event is for a table that isn't selected; count it so
                # progress can still be logged periodically.
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)
            elif catalog_entry:
                # Each handler emits RECORD messages and returns the updated
                # running row count.
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Advance every stream's bookmark to the reader's current position.
        state = update_bookmarks(state, binlog_streams_map, reader.log_file,
                                 reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        # Periodically flush STATE; the `rows_saved and` / `events_skipped
        # and` guards avoid flushing while both counters are still zero.
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0)
                or (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_table(mysql_conn, catalog_entry, state, columns, limit=None):
    """Incrementally sync one stream ordered by its replication key.

    When `limit` is set, the query is paginated: it is re-run (resuming from
    the updated replication-key bookmark) until a page returns fewer than
    `limit` rows.  With `limit=None` a single pass is performed.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    iterate_limit = True
    while iterate_limit:
        replication_key_metadata = stream_metadata.get('replication-key')
        replication_key_state = singer.get_bookmark(state,
                                                    catalog_entry.tap_stream_id,
                                                    'replication_key')

        replication_key_value = None

        if replication_key_metadata == replication_key_state:
            # Same key as the previous run/page: resume from its bookmark.
            replication_key_value = singer.get_bookmark(state,
                                                        catalog_entry.tap_stream_id,
                                                        'replication_key_value')
        else:
            # Configured key changed: record it and drop the stale value.
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'replication_key',
                                          replication_key_metadata)
            state = singer.clear_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'replication_key_value')

        stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        # NOTE(review): this ACTIVATE_VERSION is emitted once per page of the
        # while-loop (with the same version), not once per sync — confirm
        # this repetition is intended.
        activate_version_message = singer.ActivateVersionMessage(
            stream=catalog_entry.stream,
            version=stream_version
        )

        singer.write_message(activate_version_message)

        with connect_with_backoff(mysql_conn) as open_conn:
            with open_conn.cursor() as cur:
                select_sql = common.generate_select_sql(catalog_entry, columns)
                params = {}

                if replication_key_value is not None:
                    if catalog_entry.schema.properties[replication_key_metadata].format == 'date-time':
                        replication_key_value = pendulum.parse(replication_key_value)

                    select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                        replication_key_metadata,
                        replication_key_metadata)
                    params['replication_key_value'] = replication_key_value
                elif replication_key_metadata is not None:
                    # No bookmark yet: still order by the key so paging and
                    # bookmarks stay monotonic.
                    select_sql += ' ORDER BY `{}` ASC'.format(replication_key_metadata)

                if limit:
                    select_sql += ' LIMIT {}'.format(limit)

                num_rows = common.sync_query(cur, catalog_entry, state, select_sql,
                                             columns, stream_version, params)

        # A short page (or no limit at all) means the table is exhausted.
        if limit is None or num_rows < limit:
            iterate_limit = False