Example #1
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            elif catalog_entry:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # NB: Flag 0x1 (LOG_EVENT_BINLOG_IN_USE_F) is set while the binlog file is still in use (it is cleared when the log is closed cleanly), so the reader's current position is safe to bookmark.
        if hasattr(binlog_event,
                   'flags') and binlog_event.flags & LOG_EVENT_BINLOG_IN_USE_F:
            state = update_bookmarks(state, binlog_streams_map,
                                     reader.log_file, reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))
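
For orientation, here is a sketch of the binlog_streams_map shape this function reads, inferred from the lookups above (catalog_entry and desired_columns keyed by the id from common.generate_tap_stream_id); the key format and column names are invented for illustration.

# Illustrative only: the mapping _run_binlog_sync expects. Keys come from
# common.generate_tap_stream_id(schema, table); each entry carries the resolved
# catalog entry and the columns selected for replication.
binlog_streams_map = {
    'my_db-my_table': {              # hypothetical tap_stream_id
        'catalog_entry': None,       # would be the CatalogEntry resolved for my_db.my_table
        'desired_columns': ['id', 'name', 'updated_at'],
    },
}
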
Example #2
def discover_catalog(mysql_conn: Dict,
                     dbs: Optional[str] = None,
                     tables: Optional[str] = None):
    """Returns a Catalog describing the structure of the database."""

    if dbs:
        filter_dbs_clause = ",".join(
            [f"'{db_name}'" for db_name in dbs.split(",")])
        table_schema_clause = f"WHERE table_schema IN ({filter_dbs_clause})"
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql',
        'sys'
        )"""

    tables_clause = ''

    if tables is not None and tables != '':
        filter_tables_clause = ",".join(
            [f"'{table_name}'" for table_name in tables.split(",")])
        tables_clause = f" AND table_name IN ({filter_tables_clause})"

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute(f"""
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
                FROM information_schema.tables
                {table_schema_clause}{tables_clause}
            """)

            table_info = {}

            for (db_name, table, table_type, rows) in cur.fetchall():
                if db_name not in table_info:
                    table_info[db_name] = {}

                table_info[db_name][table] = {
                    'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute(f"""
                SELECT table_schema,
                       table_name,
                       column_name,
                       data_type,
                       character_maximum_length,
                       numeric_precision,
                       numeric_scale,
                       column_type,
                       column_key
                    FROM information_schema.columns
                    {table_schema_clause}{tables_clause}
                    ORDER BY table_schema, table_name
            """)

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k

                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                mdata = create_column_metadata(cols)
                md_map = metadata.to_map(mdata)

                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                is_view = table_info[table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[
                        table_schema]:
                    row_count = table_info[table_schema][table_name].get(
                        'row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == 'PRI' and s.properties[
                        c.column_name].inclusion != 'unsupported')

                key_properties = [
                    c.column_name for c in cols
                    if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)
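
A hedged usage sketch for the variant above: the database and table names are invented, and mysql_conn stands for whatever connection settings connect_with_backoff accepts here, but the .streams attribute on the returned Catalog matches how later examples consume it (discover_catalog(...).streams[0]).

# Illustrative only: discover two databases, restricted to two tables,
# and list the streams that came back.
catalog = discover_catalog(mysql_conn, dbs='shop,analytics', tables='orders,customers')

for stream in catalog.streams:
    # Each entry is a CatalogEntry with a tap_stream_id and a column schema.
    print(stream.tap_stream_id, sorted(stream.schema.properties.keys()))
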
Example #3
def discover_catalog(mysql_conn, config):
    '''Returns a Catalog describing the structure of the database.'''

    filter_dbs_config = config.get('filter_dbs')

    if filter_dbs_config:
        filter_dbs_clause = ",".join(
            ["'{}'".format(db) for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE table_schema IN ({})".format(
            filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql'
        )"""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            table_schema_sql = """
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
                FROM information_schema.tables
                {}
            """.format(table_schema_clause)
            logging.info("getting tables with {}".format(
                re.sub(r"\s{2,}", " ", table_schema_sql)))
            cur.execute(table_schema_sql)
            logging.info("successfully got table information")

            table_info = {}

            for (db, table, table_type, rows) in cur.fetchall():
                if db not in table_info:
                    table_info[db] = {}

                table_info[db][table] = {
                    'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            table_schema_query = """
                SELECT table_schema,
                       table_name,
                       column_name,
                       data_type,
                       character_maximum_length,
                       numeric_precision,
                       numeric_scale,
                       column_type,
                       column_key
                    FROM information_schema.columns
                    {}
                    ORDER BY table_schema, table_name
            """.format(table_schema_clause)
            logging.info("getting table meta data with {}".format(
                re.sub(r"\s{2,}", " ", table_schema_query)))
            cur.execute(table_schema_query)
            logging.info("successfully got table meta data")

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                is_view = table_info[table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[
                        table_schema]:
                    row_count = table_info[table_schema][table_name].get(
                        'row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == 'PRI' and s.properties[
                        c.column_name].inclusion != 'unsupported')

                key_properties = [
                    c.column_name for c in cols
                    if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)
Example #4
def discover_catalog(mysql_conn, config):
    """Returns a Catalog describing the structure of the database."""

    filter_dbs_config = config.get("filter_dbs")

    if filter_dbs_config:
        filter_dbs_clause = ",".join(["'{}'".format(db) for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE table_schema IN ({})".format(filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql',
        'sys'
        )"""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute(
                """
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
                FROM information_schema.tables
                {}
            """.format(
                    table_schema_clause
                )
            )

            table_info = {}

            for (db, table, table_type, rows) in cur.fetchall():
                if db not in table_info:
                    table_info[db] = {}

                table_info[db][table] = {"row_count": rows, "is_view": table_type == "VIEW"}

            cur.execute(
                """
                SELECT table_schema,
                       table_name,
                       column_name,
                       data_type,
                       character_maximum_length,
                       numeric_precision,
                       numeric_scale,
                       column_type,
                       column_key
                    FROM information_schema.columns
                    {}
                    ORDER BY table_schema, table_name
            """.format(
                    table_schema_clause
                )
            )

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(
                    type="object", properties={c.column_name: schema_for_column(c) for c in cols}
                )
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), "database-name", table_schema)

                is_view = table_info[table_schema][table_name]["is_view"]

                if table_schema in table_info and table_name in table_info[table_schema]:
                    row_count = table_info[table_schema][table_name].get("row_count")

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), "row-count", row_count)

                    md_map = metadata.write(md_map, (), "is-view", is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == "PRI" and s.properties[c.column_name].inclusion != "unsupported"
                )

                key_properties = [c.column_name for c in cols if column_is_key_prop(c, schema)]

                if not is_view:
                    md_map = metadata.write(md_map, (), "table-key-properties", key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(table_schema, table_name),
                    schema=schema,
                )

                entries.append(entry)

    return Catalog(entries)
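
For reference, a rough sketch of what the table-level metadata written above looks like after metadata.to_list converts it; the values are invented and the breadcrumb layout follows the usual singer-python convention, so treat the exact shape as an assumption.

# Hypothetical stream-level metadata for one discovered table. Per-column
# entries produced by create_column_metadata (breadcrumb ('properties', <col>))
# are omitted for brevity.
example_metadata = [
    {
        'breadcrumb': (),
        'metadata': {
            'database-name': 'shop',
            'row-count': 1042,
            'is-view': False,
            'table-key-properties': ['id'],
        },
    },
]
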
Example #5
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map,
                                           state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s",
                    server_id)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                log_file, log_pos)

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            elif catalog_entry:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        state = update_bookmarks(state, binlog_streams_map, reader.log_file,
                                 reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
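
update_bookmarks itself is not shown in these examples; here is a minimal sketch of what it plausibly does, assuming singer-python's write_bookmark helper and the log_file/log_pos keys implied by BOOKMARK_KEYS above.

import singer

def update_bookmarks_sketch(state, binlog_streams_map, log_file, log_pos):
    # Assumed behaviour: record the same binlog coordinates for every stream
    # replicated from the binlog, then hand the state back to the caller.
    for tap_stream_id in binlog_streams_map:
        state = singer.write_bookmark(state, tap_stream_id, 'log_file', log_file)
        state = singer.write_bookmark(state, tap_stream_id, 'log_pos', log_pos)
    return state
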
Example #6
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state,
                     config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            else:

                # Compare event's columns to the schema properties
                # If a column no longer exists, the event refers to it with a placeholder like
                # __dropped_col_XY__; we don't want these placeholder columns included in the difference.
                diff = set(filter(lambda k: False if re.match(r'__dropped_col_\d+__', k) else True,
                                  get_db_column_types(binlog_event).keys())).\
                    difference(catalog_entry.schema.properties.keys())

                # If there are additional cols in the event then run discovery and update the catalog
                if diff:
                    LOGGER.debug('Difference between event and schema: %s',
                                 diff)
                    LOGGER.info('Running discovery ... ')

                    # run discovery for the current table only
                    new_catalog_entry = discover_catalog(
                        mysql_conn, config.get('filter_dbs'),
                        catalog_entry.table).streams[0]

                    selected = {
                        k
                        for k, v in
                        new_catalog_entry.schema.properties.items()
                        if common.property_is_selected(new_catalog_entry, k)
                    }

                    # The new catalog entry's "stream" property is the table name; update it to match
                    # what the "resolve_catalog" function would have produced.
                    new_catalog_entry.stream = tap_stream_id

                    # These are the columns we need to select
                    new_columns = desired_columns(selected,
                                                  new_catalog_entry.schema)

                    cols = set(new_catalog_entry.schema.properties.keys())

                    # drop unsupported properties from schema
                    for col in cols:
                        if col not in new_columns:
                            new_catalog_entry.schema.properties.pop(col, None)

                    # Add the _sdc_deleted_at col
                    new_columns = add_automatic_properties(
                        new_catalog_entry, list(new_columns))

                    # send the new schema to the target if it changed
                    if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                        write_schema_message(catalog_entry=new_catalog_entry)
                        catalog_entry = new_catalog_entry

                        # update this dictionary while we're at it
                        binlog_streams_map[tap_stream_id][
                            'catalog_entry'] = new_catalog_entry
                        binlog_streams_map[tap_stream_id][
                            'desired_columns'] = new_columns
                        columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    # Update the singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
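
The dropped-column filtering in Example #6 reads more easily as a standalone helper; a sketch assuming get_db_column_types(event) returns a mapping keyed by column name, as the code above implies.

import re

def columns_missing_from_schema(event_columns, schema_properties):
    # Ignore the placeholder names the binlog uses for columns that have been
    # dropped, then report event columns the catalog schema does not know about.
    real_columns = {
        name for name in event_columns
        if not re.match(r'__dropped_col_\d+__', name)
    }
    return real_columns.difference(schema_properties)

# Roughly equivalent to the inline expression above:
# columns_missing_from_schema(get_db_column_types(binlog_event).keys(),
#                             catalog_entry.schema.properties.keys())
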
Example #7
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state,
                     config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            else:

                # Compare event's columns to the schema properties
                diff = set(get_db_column_types(binlog_event).keys()).\
                    difference(catalog_entry.schema.properties.keys())

                # If there are additional cols in the event then run discovery and update the catalog
                if diff:
                    # run discovery for the current table only
                    catalog_entry = discover_catalog(
                        mysql_conn, config.get('filter_dbs'),
                        catalog_entry.table).streams[0]

                    # The new catalog entry's "stream" property is the table name; update it to match
                    # what the "resolve_catalog" function would have produced.
                    catalog_entry.stream = tap_stream_id
                    desired_columns = list(
                        catalog_entry.schema.properties.keys())

                    # Add the _sdc_deleted_at col
                    add_automatic_properties(catalog_entry, desired_columns)

                    # update this dictionary while we're at it
                    binlog_streams_map[tap_stream_id][
                        'catalog_entry'] = catalog_entry
                    binlog_streams_map[tap_stream_id][
                        'desired_columns'] = desired_columns

                    # send the new schema to the target
                    write_schema_message(catalog_entry=catalog_entry)

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    # Update the singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
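
add_automatic_properties is referenced above but not shown; a hedged sketch of what it presumably does, based on the "_sdc_deleted_at" comments in Examples #6 and #7. The exact schema type used for the column is an assumption.

from singer.schema import Schema

def add_automatic_properties_sketch(catalog_entry, columns):
    # Assumed behaviour: register the soft-delete timestamp column that
    # DELETE row events are emitted with, and make sure it is selected.
    catalog_entry.schema.properties['_sdc_deleted_at'] = Schema(
        type=['null', 'string'], format='date-time')
    if '_sdc_deleted_at' not in columns:
        columns.append('_sdc_deleted_at')
    return columns
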
Example #8
def _run_binlog_sync(mysql_conn: MySQLConnection, reader: BinLogStreamReader,
                     binlog_streams_map: Dict, state: Dict, config: Dict,
                     end_log_file: str, end_log_pos: int):

    processed_rows_events = 0
    events_skipped = 0

    log_file = None
    log_pos = None
    gtid_pos = reader.auto_position  # initial gtid, we set this when we created the reader's instance

    # A set of columns detected during sync that should be ignored because their types are unsupported.
    # Caching them here avoids re-checking whether a column should be ignored over and over again.
    ignored_columns = set()

    # Exit the loop when the reader either runs out of events to return or we reach
    # the end position (the master's position captured by the caller)
    for binlog_event in reader:

        # get reader current binlog file and position
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if (log_file > end_log_file) or (end_log_file == log_file
                                         and log_pos >= end_log_pos):
            LOGGER.info(
                'BinLog reader (file: %s, pos:%s) has reached or exceeded end position, exiting!',
                log_file, log_pos)

            # A mass operation (inserts, updates, deletes) can start right after the master's binlog
            # file and position are captured above, leaving that captured position behind the stream
            # reader. Bookmarking the reader's position would then skip, in the next run, everything
            # between end_log_file/end_log_pos and the reader's position, so reset the bookmark back
            # to the master's position instead.
            log_file = end_log_file
            log_pos = end_log_pos

            break

        if isinstance(binlog_event, RotateEvent):
            LOGGER.debug('RotateEvent: log_file=%s, log_pos=%d',
                         binlog_event.next_binlog, binlog_event.position)

            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position, gtid_pos)

        elif isinstance(binlog_event, MariadbGtidEvent) or isinstance(
                binlog_event, GtidEvent):
            gtid_pos = binlog_event.gtid

            LOGGER.debug('%s: gtid=%s', binlog_event.__class__.__name__,
                         gtid_pos)

            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos, gtid_pos)

            # There is strange behavior when using GTID in the pymysqlreplication lib,
            # explained here: https://github.com/noplay/python-mysql-replication/issues/367
            # Fix: Updating the reader's auto-position to the newly encountered gtid means we won't have to restart
            # consuming the binlog from an old GTID position if the connection to the server is lost.
            reader.auto_position = gtid_pos

        else:
            time_extracted = utils.now()

            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, processed_rows_events)
            else:
                # Compare event's columns to the schema properties
                diff = __get_diff_in_columns_list(
                    binlog_event, catalog_entry.schema.properties.keys(),
                    ignored_columns)

                # If there are additional cols in the event then run discovery if needed and update the catalog
                if diff:

                    LOGGER.info(
                        'Stream `%s`: Difference detected between event and schema: %s',
                        tap_stream_id, diff)

                    md_map = metadata.to_map(catalog_entry.metadata)

                    if not should_run_discovery(diff, md_map):
                        LOGGER.info(
                            'Stream `%s`: Not running discovery. Ignoring all detected columns in %s',
                            tap_stream_id, diff)
                        ignored_columns = ignored_columns.union(diff)

                    else:
                        LOGGER.info('Stream `%s`: Running discovery ... ',
                                    tap_stream_id)

                        # run discovery for the current table only
                        new_catalog_entry = discover_catalog(
                            mysql_conn, config.get('filter_dbs'),
                            catalog_entry.table).streams[0]

                        selected = {
                            k
                            for k, v in
                            new_catalog_entry.schema.properties.items()
                            if common.property_is_selected(
                                new_catalog_entry, k)
                        }

                        # The new catalog entry's "stream" property is the table name; update it to
                        # match what the "resolve_catalog" function would have produced.
                        new_catalog_entry.stream = tap_stream_id

                        # These are the columns we need to select
                        new_columns = desired_columns(selected,
                                                      new_catalog_entry.schema)

                        cols = set(new_catalog_entry.schema.properties.keys())

                        # drop unsupported properties from schema
                        for col in cols:
                            if col not in new_columns:
                                new_catalog_entry.schema.properties.pop(
                                    col, None)

                        # Add the _sdc_deleted_at col
                        new_columns = add_automatic_properties(
                            new_catalog_entry, list(new_columns))

                        # send the new schema to the target if it changed
                        if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                            write_schema_message(
                                catalog_entry=new_catalog_entry)
                            catalog_entry = new_catalog_entry

                            # update this dictionary while we're at it
                            binlog_streams_map[tap_stream_id][
                                'catalog_entry'] = new_catalog_entry
                            binlog_streams_map[tap_stream_id][
                                'desired_columns'] = new_columns
                            columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    processed_rows_events = handle_write_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    processed_rows_events = handle_update_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    processed_rows_events = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update singer bookmark and send STATE message periodically
        if ((processed_rows_events
             and processed_rows_events % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos, gtid_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    LOGGER.info('Processed %s rows', processed_rows_events)

    # Update the singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos,
                                 gtid_pos)
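
The exit check in Example #8 compares binlog file names as strings; a small sketch of that condition as a helper, assuming the usual zero-padded naming (mysql-bin.000001, mysql-bin.000002, ...) so that lexicographic order matches creation order.

def reached_end_position(log_file, log_pos, end_log_file, end_log_pos):
    # True when the reader has rotated past end_log_file, or is on the same
    # file at or beyond end_log_pos (the master position captured up front).
    return (log_file > end_log_file) or (
        log_file == end_log_file and log_pos >= end_log_pos)

# e.g. inside the event loop:
# if reached_end_position(reader.log_file, reader.log_pos, end_log_file, end_log_pos):
#     break
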
Example #9
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            elif catalog_entry:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    # Update the singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
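
Finally, fetch_current_log_file_and_pos is used throughout these examples but never shown; a minimal sketch of what it plausibly does, assuming it reads the server's current coordinates via SHOW MASTER STATUS and reuses the connect_with_backoff helper from the discovery examples.

def fetch_current_log_file_and_pos_sketch(mysql_conn):
    # Assumed behaviour: ask the server for its current binlog coordinates.
    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("SHOW MASTER STATUS")
            result = cur.fetchone()
            if result is None:
                raise Exception("MySQL binary logging appears to be disabled.")
            log_file, log_pos = result[0], result[1]
            return log_file, log_pos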