def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema,
                                                          binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                events_skipped,
                                rows_saved)
            elif catalog_entry:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(binlog_event,
                                                         catalog_entry,
                                                         state,
                                                         desired_columns,
                                                         rows_saved,
                                                         time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          desired_columns,
                                                          rows_saved,
                                                          time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          desired_columns,
                                                          rows_saved,
                                                          time_extracted)
                else:
                    LOGGER.info("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                binlog_event.schema,
                                binlog_event.table)

        # NB: Flag 0x1 indicates that the binlog has been closed successfully, so we can rely on this being a complete log.
        if hasattr(binlog_event, 'flags') and binlog_event.flags & LOG_EVENT_BINLOG_IN_USE_F:
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     reader.log_file,
                                     reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def discover_catalog(mysql_conn: Dict, dbs: str = None, tables: Optional[str] = None):
    """Returns a Catalog describing the structure of the database."""
    if dbs:
        filter_dbs_clause = ",".join([f"'{db_name}'" for db_name in dbs.split(",")])
        table_schema_clause = f"WHERE table_schema IN ({filter_dbs_clause})"
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql',
        'sys'
        )"""

    tables_clause = ''

    if tables is not None and tables != '':
        filter_tables_clause = ",".join([f"'{table_name}'" for table_name in tables.split(",")])
        tables_clause = f" AND table_name IN ({filter_tables_clause})"

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute(f"""
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
              FROM information_schema.tables
              {table_schema_clause}{tables_clause}
            """)

            table_info = {}

            for (db_name, table, table_type, rows) in cur.fetchall():
                if db_name not in table_info:
                    table_info[db_name] = {}

                table_info[db_name][table] = {
                    'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute(f"""
            SELECT table_schema,
                   table_name,
                   column_name,
                   data_type,
                   character_maximum_length,
                   numeric_precision,
                   numeric_scale,
                   column_type,
                   column_key
              FROM information_schema.columns
              {table_schema_clause}{tables_clause}
              ORDER BY table_schema, table_name
            """)

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={c.column_name: schema_for_column(c) for c in cols})
                mdata = create_column_metadata(cols)
                md_map = metadata.to_map(mdata)

                md_map = metadata.write(md_map, (), 'database-name', table_schema)

                is_view = table_info[table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[table_schema]:
                    row_count = table_info[table_schema][table_name].get('row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count', row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == 'PRI' and
                    s.properties[c.column_name].inclusion != 'unsupported')

                key_properties = [c.column_name for c in cols if column_is_key_prop(c, schema)]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties', key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)

def discover_catalog(mysql_conn, config):
    '''Returns a Catalog describing the structure of the database.'''

    filter_dbs_config = config.get('filter_dbs')

    if filter_dbs_config:
        filter_dbs_clause = ",".join(
            ["'{}'".format(db) for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql'
        )"""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            table_schema_sql = """
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
              FROM information_schema.tables
              {}
            """.format(table_schema_clause)

            logging.info("getting tables with {}".format(
                re.sub(r"\s{2,}", " ", table_schema_sql)))

            cur.execute(table_schema_sql)

            logging.info("successfully got table information")

            table_info = {}

            for (db, table, table_type, rows) in cur.fetchall():
                if db not in table_info:
                    table_info[db] = {}

                table_info[db][table] = {
                    'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            table_schema_query = """
            SELECT table_schema,
                   table_name,
                   column_name,
                   data_type,
                   character_maximum_length,
                   numeric_precision,
                   numeric_scale,
                   column_type,
                   column_key
              FROM information_schema.columns
              {}
              ORDER BY table_schema, table_name
            """.format(table_schema_clause)

            logging.info("getting table meta data with {}".format(
                re.sub(r"\s{2,}", " ", table_schema_query)))

            cur.execute(table_schema_query)

            logging.info("successfully got table meta data")

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={c.column_name: schema_for_column(c) for c in cols})
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), 'database-name', table_schema)

                is_view = table_info[table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[table_schema]:
                    row_count = table_info[table_schema][table_name].get('row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count', row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == 'PRI' and
                    s.properties[c.column_name].inclusion != 'unsupported')

                key_properties = [c.column_name for c in cols if column_is_key_prop(c, schema)]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties', key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)

def discover_catalog(mysql_conn, config):
    """Returns a Catalog describing the structure of the database."""
    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        filter_dbs_clause = ",".join(["'{}'".format(db) for db in filter_dbs_config.split(",")])
        table_schema_clause = "WHERE table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql',
        'sys'
        )"""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute(
                """
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
              FROM information_schema.tables
              {}
            """.format(table_schema_clause)
            )

            table_info = {}

            for (db, table, table_type, rows) in cur.fetchall():
                if db not in table_info:
                    table_info[db] = {}

                table_info[db][table] = {"row_count": rows, "is_view": table_type == "VIEW"}

            cur.execute(
                """
            SELECT table_schema,
                   table_name,
                   column_name,
                   data_type,
                   character_maximum_length,
                   numeric_precision,
                   numeric_scale,
                   column_type,
                   column_key
              FROM information_schema.columns
              {}
              ORDER BY table_schema, table_name
            """.format(table_schema_clause)
            )

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(
                    type="object", properties={c.column_name: schema_for_column(c) for c in cols}
                )
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), "database-name", table_schema)

                is_view = table_info[table_schema][table_name]["is_view"]

                if table_schema in table_info and table_name in table_info[table_schema]:
                    row_count = table_info[table_schema][table_name].get("row_count")

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), "row-count", row_count)

                    md_map = metadata.write(md_map, (), "is-view", is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == "PRI" and s.properties[c.column_name].inclusion != "unsupported"
                )

                key_properties = [c.column_name for c in cols if column_is_key_prop(c, schema)]

                if not is_view:
                    md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                    schema=schema,
                )

                entries.append(entry)

    return Catalog(entries)

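The discover_catalog variants above all return a singer-python Catalog built from information_schema. As a rough illustration of how such a catalog is typically emitted during a Singer tap's discovery mode, here is a minimal, hedged sketch (not code from this tap; the do_discover name and its arguments are placeholders, and Catalog.to_dict() comes from singer-python):

import json
import sys


def do_discover(mysql_conn, config):
    # Build the catalog from information_schema and print it to stdout,
    # which is what a Singer tap conventionally does when run with --discover.
    catalog = discover_catalog(mysql_conn, config)
    json.dump(catalog.to_dict(), sys.stdout, indent=2)
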
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s", server_id)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
                                pymysql_wrapper=connection_wrapper)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema,
                                                          binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                events_skipped,
                                rows_saved)
            elif catalog_entry:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(binlog_event,
                                                         catalog_entry,
                                                         state,
                                                         desired_columns,
                                                         rows_saved,
                                                         time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          desired_columns,
                                                          rows_saved,
                                                          time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          desired_columns,
                                                          rows_saved,
                                                          time_extracted)
                else:
                    LOGGER.info("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                binlog_event.schema,
                                binlog_event.table)

        state = update_bookmarks(state,
                                 binlog_streams_map,
                                 reader.log_file,
                                 reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema,
                                                          binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                 events_skipped,
                                 rows_saved)
            else:
                # Compare event's columns to the schema properties
                # if a column no longer exists, the event will have something like __dropped_col_XY__
                # to refer to this column, we don't want these columns to be included in the difference
                diff = set(filter(lambda k: False if re.match(r'__dropped_col_\d+__', k) else True,
                                  get_db_column_types(binlog_event).keys())). \
                    difference(catalog_entry.schema.properties.keys())

                # If there are additional cols in the event then run discovery and update the catalog
                if diff:
                    LOGGER.debug('Difference between event and schema: %s', diff)
                    LOGGER.info('Running discovery ... ')

                    # run discovery for the current table only
                    new_catalog_entry = discover_catalog(mysql_conn,
                                                         config.get('filter_dbs'),
                                                         catalog_entry.table).streams[0]

                    selected = {k for k, v in new_catalog_entry.schema.properties.items()
                                if common.property_is_selected(new_catalog_entry, k)}

                    # the new catalog has "stream" property = table name, we need to update that to make it the same as
                    # the result of the "resolve_catalog" function
                    new_catalog_entry.stream = tap_stream_id

                    # These are the columns we need to select
                    new_columns = desired_columns(selected, new_catalog_entry.schema)

                    cols = set(new_catalog_entry.schema.properties.keys())

                    # drop unsupported properties from schema
                    for col in cols:
                        if col not in new_columns:
                            new_catalog_entry.schema.properties.pop(col, None)

                    # Add the _sdc_deleted_at col
                    new_columns = add_automatic_properties(new_catalog_entry, list(new_columns))

                    # send the new schema to target if we have a new schema
                    if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                        write_schema_message(catalog_entry=new_catalog_entry)
                        catalog_entry = new_catalog_entry

                        # update this dictionary while we're at it
                        binlog_streams_map[tap_stream_id]['catalog_entry'] = new_catalog_entry
                        binlog_streams_map[tap_stream_id]['desired_columns'] = new_columns
                        columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(binlog_event,
                                                         catalog_entry,
                                                         state,
                                                         columns,
                                                         rows_saved,
                                                         time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          columns,
                                                          rows_saved,
                                                          time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          columns,
                                                          rows_saved,
                                                          time_extracted)
                else:
                    LOGGER.debug("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                 binlog_event.schema,
                                 binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     log_file,
                                     log_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # Update singer bookmark one last time to point to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state,
                                 binlog_streams_map,
                                 log_file,
                                 log_pos)

def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema,
                                                          binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                 events_skipped,
                                 rows_saved)
            else:
                # Compare event's columns to the schema properties
                diff = set(get_db_column_types(binlog_event).keys()). \
                    difference(catalog_entry.schema.properties.keys())

                # If there are additional cols in the event then run discovery and update the catalog
                if diff:
                    # run discovery for the current table only
                    catalog_entry = discover_catalog(mysql_conn,
                                                     config.get('filter_dbs'),
                                                     catalog_entry.table).streams[0]

                    # the new catalog has "stream" property = table name, we need to update that to make it the same as
                    # the result of the "resolve_catalog" function
                    catalog_entry.stream = tap_stream_id
                    desired_columns = list(catalog_entry.schema.properties.keys())

                    # Add the _sdc_deleted_at col
                    add_automatic_properties(catalog_entry, desired_columns)

                    # update this dictionary while we're at it
                    binlog_streams_map[tap_stream_id]['catalog_entry'] = catalog_entry
                    binlog_streams_map[tap_stream_id]['desired_columns'] = desired_columns

                    # send the new schema to target
                    write_schema_message(catalog_entry=catalog_entry)

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(binlog_event,
                                                         catalog_entry,
                                                         state,
                                                         desired_columns,
                                                         rows_saved,
                                                         time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          desired_columns,
                                                          rows_saved,
                                                          time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          desired_columns,
                                                          rows_saved,
                                                          time_extracted)
                else:
                    LOGGER.debug("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                 binlog_event.schema,
                                 binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     log_file,
                                     log_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # Update singer bookmark one last time to point to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state,
                                 binlog_streams_map,
                                 log_file,
                                 log_pos)

def _run_binlog_sync(mysql_conn: MySQLConnection,
                     reader: BinLogStreamReader,
                     binlog_streams_map: Dict,
                     state: Dict,
                     config: Dict,
                     end_log_file: str,
                     end_log_pos: int):
    processed_rows_events = 0
    events_skipped = 0

    log_file = None
    log_pos = None
    gtid_pos = reader.auto_position  # initial gtid, we set this when we created the reader's instance

    # A set to hold all columns that are detected as we sync but should be ignored because they are unsupported types.
    # Saving them here to avoid doing the check if we should ignore a column over and over again
    ignored_columns = set()

    # Exit from the loop when the reader either runs out of streams to return or we reach
    # the end position (which is Master's)
    for binlog_event in reader:
        # get reader current binlog file and position
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if (log_file > end_log_file) or (end_log_file == log_file and log_pos >= end_log_pos):
            LOGGER.info('BinLog reader (file: %s, pos:%s) has reached or exceeded end position, exiting!',
                        log_file,
                        log_pos)

            # There are cases when a mass operation (inserts, updates, deletes) starts right after we get the Master
            # binlog file and position above, making the latter behind the stream reader and it causes some data loss
            # in the next run by skipping everything between end_log_file and log_pos
            # so we need to update log_pos back to master's position
            log_file = end_log_file
            log_pos = end_log_pos

            break

        if isinstance(binlog_event, RotateEvent):
            LOGGER.debug('RotateEvent: log_file=%s, log_pos=%d',
                         binlog_event.next_binlog,
                         binlog_event.position)

            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position,
                                     gtid_pos)

        elif isinstance(binlog_event, MariadbGtidEvent) or isinstance(binlog_event, GtidEvent):
            gtid_pos = binlog_event.gtid

            LOGGER.debug('%s: gtid=%s',
                         binlog_event.__class__.__name__,
                         gtid_pos)

            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     log_file,
                                     log_pos,
                                     gtid_pos)

            # There is strange behavior happening when using GTID in the pymysqlreplication lib,
            # explained here: https://github.com/noplay/python-mysql-replication/issues/367
            # Fix: Updating the reader's auto-position to the newly encountered gtid means we won't have to restart
            # consuming binlog from old GTID pos when connection to server is lost.
            reader.auto_position = gtid_pos

        else:
            time_extracted = utils.now()

            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema,
                                                          binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                 events_skipped,
                                 processed_rows_events)
            else:
                # Compare event's columns to the schema properties
                diff = __get_diff_in_columns_list(binlog_event,
                                                  catalog_entry.schema.properties.keys(),
                                                  ignored_columns)

                # If there are additional cols in the event then run discovery if needed and update the catalog
                if diff:
                    LOGGER.info('Stream `%s`: Difference detected between event and schema: %s',
                                tap_stream_id,
                                diff)

                    md_map = metadata.to_map(catalog_entry.metadata)

                    if not should_run_discovery(diff, md_map):
                        LOGGER.info('Stream `%s`: Not running discovery. Ignoring all detected columns in %s',
                                    tap_stream_id,
                                    diff)
                        ignored_columns = ignored_columns.union(diff)

                    else:
                        LOGGER.info('Stream `%s`: Running discovery ... ', tap_stream_id)

                        # run discovery for the current table only
                        new_catalog_entry = discover_catalog(mysql_conn,
                                                             config.get('filter_dbs'),
                                                             catalog_entry.table).streams[0]

                        selected = {k for k, v in new_catalog_entry.schema.properties.items()
                                    if common.property_is_selected(new_catalog_entry, k)}

                        # the new catalog has "stream" property = table name, we need to update that to make it the
                        # same as the result of the "resolve_catalog" function
                        new_catalog_entry.stream = tap_stream_id

                        # These are the columns we need to select
                        new_columns = desired_columns(selected, new_catalog_entry.schema)

                        cols = set(new_catalog_entry.schema.properties.keys())

                        # drop unsupported properties from schema
                        for col in cols:
                            if col not in new_columns:
                                new_catalog_entry.schema.properties.pop(col, None)

                        # Add the _sdc_deleted_at col
                        new_columns = add_automatic_properties(new_catalog_entry, list(new_columns))

                        # send the new schema to target if we have a new schema
                        if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                            write_schema_message(catalog_entry=new_catalog_entry)
                            catalog_entry = new_catalog_entry

                            # update this dictionary while we're at it
                            binlog_streams_map[tap_stream_id]['catalog_entry'] = new_catalog_entry
                            binlog_streams_map[tap_stream_id]['desired_columns'] = new_columns
                            columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    processed_rows_events = handle_write_rows_event(binlog_event,
                                                                    catalog_entry,
                                                                    state,
                                                                    columns,
                                                                    processed_rows_events,
                                                                    time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    processed_rows_events = handle_update_rows_event(binlog_event,
                                                                     catalog_entry,
                                                                     state,
                                                                     columns,
                                                                     processed_rows_events,
                                                                     time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    processed_rows_events = handle_delete_rows_event(binlog_event,
                                                                     catalog_entry,
                                                                     state,
                                                                     columns,
                                                                     processed_rows_events,
                                                                     time_extracted)
                else:
                    LOGGER.debug("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                 binlog_event.schema,
                                 binlog_event.table)

        # Update singer bookmark and send STATE message periodically
        if ((processed_rows_events and processed_rows_events % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     log_file,
                                     log_pos,
                                     gtid_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    LOGGER.info('Processed %s rows', processed_rows_events)

    # Update singer bookmark one last time to point to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state,
                                 binlog_streams_map,
                                 log_file,
                                 log_pos,
                                 gtid_pos)

def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema,
                                                          binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                events_skipped,
                                rows_saved)
            elif catalog_entry:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(binlog_event,
                                                         catalog_entry,
                                                         state,
                                                         desired_columns,
                                                         rows_saved,
                                                         time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          desired_columns,
                                                          rows_saved,
                                                          time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(binlog_event,
                                                          catalog_entry,
                                                          state,
                                                          desired_columns,
                                                          rows_saved,
                                                          time_extracted)
                else:
                    LOGGER.info("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                binlog_event.schema,
                                binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     log_file,
                                     log_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # Update singer bookmark one last time to point to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state,
                                 binlog_streams_map,
                                 log_file,
                                 log_pos)
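
Every _run_binlog_sync variant above delegates bookmark writes to update_bookmarks, whose implementation is not shown in this section. The following is a minimal sketch of what such a helper could look like, assuming the standard singer.bookmarks API and that the state is keyed per tap_stream_id; the function name update_bookmarks_sketch and the bookmark key names are hypothetical, not the tap's actual code:

from singer import bookmarks


def update_bookmarks_sketch(state, binlog_streams_map, log_file, log_pos):
    # Record the current binlog coordinates for every selected stream so the
    # next run can resume from this position.
    for tap_stream_id in binlog_streams_map:
        state = bookmarks.write_bookmark(state, tap_stream_id, 'log_file', log_file)
        state = bookmarks.write_bookmark(state, tap_stream_id, 'log_pos', log_pos)

    return state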