Example #1
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id, _ in binlog_streams_map.items():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map,
                                           state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s",
                    server_id)

    connection_wrapper = make_connection_wrapper(config)
    reader = None

    try:
        slave_uuid = f"bi-reader-{random.getrandbits(64):04x}"

        reader = BinLogStreamReader(
            connection_settings={},
            server_id=server_id,
            slave_uuid=slave_uuid,
            log_file=log_file,
            log_pos=log_pos,
            resume_stream=True,
            only_events=[
                RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent
            ],
            pymysql_wrapper=connection_wrapper,
        )
        LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                    log_file, log_pos)
        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config)
    finally:
        # BinLogStreamReader doesn't implement the context-manager (`with`) protocol,
        # so try/finally ensures the reader and its connection are closed.
        if reader:
            reader.close()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
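Since BinLogStreamReader exposes close() but no context-manager support (as the comment above notes), the same cleanup can also be expressed with contextlib.closing. A minimal sketch under that assumption, reusing the names from the example:

import contextlib

# contextlib.closing() calls reader.close() on exit, which is all the
# try/finally above achieves.
with contextlib.closing(BinLogStreamReader(
        connection_settings={},
        server_id=server_id,
        slave_uuid=slave_uuid,
        log_file=log_file,
        log_pos=log_pos,
        resume_stream=True,
        only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
        pymysql_wrapper=connection_wrapper,
)) as reader:
    _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config)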
Example #2
def sync_table(connection, config, catalog_entry, state, columns):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                   'log_file')

    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                  'log_pos')

    verify_binlog_config(connection, catalog_entry)
    verify_log_file_exists(connection, catalog_entry, log_file, log_pos)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    server_id = fetch_server_id(connection)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    table_path = (catalog_entry.database, catalog_entry.stream)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                log_file, log_pos)

    rows_saved = 0

    for binlog_event in reader:
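        # resume_stream=True replays the event at the bookmarked file/pos first,
        # so skip it to avoid re-emitting a record from the previous sync.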
        if reader.log_file == log_file and reader.log_pos == log_pos:
            LOGGER.info(
                "Skipping event for log_file=%s and log_pos=%s as it was processed last sync",
                reader.log_file, reader.log_pos)
            continue

        if isinstance(binlog_event, RotateEvent):
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', binlog_event.next_binlog)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', binlog_event.position)

        elif (binlog_event.schema, binlog_event.table) == table_path:
            db_column_types = {c.name: c.type for c in binlog_event.columns}

            if isinstance(binlog_event, WriteRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {
                        k: v
                        for k, v in row['values'].items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)

                    rows_saved = rows_saved + 1

            elif isinstance(binlog_event, UpdateRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {
                        k: v
                        for k, v in row['after_values'].items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)

                    rows_saved = rows_saved + 1
            elif isinstance(binlog_event, DeleteRowsEvent):
                for row in binlog_event.rows:
                    event_ts = datetime.datetime.utcfromtimestamp(
                        binlog_event.timestamp).replace(tzinfo=pytz.UTC)

                    vals = row['values']
                    vals[SDC_DELETED_AT] = event_ts

                    filtered_vals = {
                        k: v
                        for k, v in vals.items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)

                    rows_saved = rows_saved + 1

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', reader.log_file)

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', reader.log_pos)

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                yield singer.StateMessage(value=copy.deepcopy(state))

    yield singer.StateMessage(value=copy.deepcopy(state))
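Note that sync_table is a generator: it yields Singer record and state messages rather than writing them itself. A minimal consumer sketch (it assumes the surrounding tap has already built connection, config, catalog_entry, state, and columns):

# Hypothetical caller: drain the generator and emit every message on stdout.
for message in sync_table(connection, config, catalog_entry, state, columns):
    singer.write_message(message)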
Example #3
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map,
                                           state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s",
                    server_id)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                log_file, log_pos)

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            else:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        state = update_bookmarks(state, binlog_streams_map, reader.log_file,
                                 reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
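The loop's stop condition relies on the binlog coordinates captured by fetch_current_log_file_and_pos before iteration starts. A plausible shape for that helper, shown only as a sketch and assuming mysql_conn behaves like an open pymysql connection (the tap's real connection handling may differ):

def fetch_current_log_file_and_pos(mysql_conn):
    # SHOW MASTER STATUS reports the server's current binlog file and position;
    # the sync loop uses them as a stopping point because the server does not
    # always send the EOF packet that would otherwise end the stream.
    with mysql_conn.cursor() as cur:
        cur.execute("SHOW MASTER STATUS")
        result = cur.fetchone()
        if result is None:
            raise Exception("MySQL binary logging is not enabled")
        return result[0], result[1]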
Example #4
def create_binlog_stream_reader(config: Dict, log_file: Optional[str],
                                log_pos: Optional[int],
                                gtid_pos: Optional[str]) -> BinLogStreamReader:
    """
    Create an instance of BinLogStreamReader with the right config

    Args:
        config: dictionary of the content of tap config.json
        log_file: binlog file name to start replication from (Optional if using gtid)
        log_pos: binlog pos to start replication from (Optional if using gtid)
        gtid_pos: GTID pos to start replication from (Optional if using log_file & pos)

    Returns: Instance of BinLogStreamReader
    """
    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        # generate a random server_id for this replica (2 ** 32; `2 ^ 32` would be XOR)
        server_id = random.randint(1, 2 ** 32)
        LOGGER.info("Using randomly generated server_id=%s", server_id)

    engine = config['engine']

    kwargs = {
        'connection_settings': {},
        'pymysql_wrapper': make_connection_wrapper(config),
        'is_mariadb': connection.MARIADB_ENGINE == engine,
        'server_id': server_id,  # slave server ID
        # report_slave makes this replica appear in SHOW SLAVE HOSTS;
        'report_slave': socket.gethostname() or 'pipelinewise',
        'only_events': [WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
    }

    # only fetch events pertaining to the schemas in filter db.
    if config.get('filter_db'):
        kwargs['only_schemas'] = config['filter_db'].split(',')

    if config['use_gtid']:

        if not gtid_pos:
            raise ValueError(
                f'gtid_pos is empty "{gtid_pos}"! Cannot start logical replication from empty gtid.'
            )

        LOGGER.info(
            "Starting logical replication from GTID '%s' on engine '%s'",
            gtid_pos, engine)

        # When using GTID, we want to listen in for GTID events and start from given gtid pos
        kwargs['only_events'].extend([GtidEvent, MariadbGtidEvent])
        kwargs['auto_position'] = gtid_pos

    else:
        if not log_file or not log_pos or log_pos < 0:
            raise ValueError(
                f'log file or pos is empty ("{log_file}", "{log_pos}")! '
                f'Cannot start logical replication from invalid log file/pos.')

        LOGGER.info("Starting logical replication from binlog file ['%s', %d]",
                    log_file, log_pos)

        # When not using GTID, we want to listen in for rotate events, and start from given log position and file
        kwargs['only_events'].append(RotateEvent)
        kwargs['log_file'] = log_file
        kwargs['log_pos'] = log_pos
        kwargs['resume_stream'] = True

    return BinLogStreamReader(**kwargs)
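A hedged usage sketch for create_binlog_stream_reader, with placeholder bookmark values (real coordinates would come from the tap's state, and the GTID format depends on the engine):

# Start either from a GTID position or from a binlog file/pos bookmark,
# mirroring the use_gtid branch inside the factory above.
if config['use_gtid']:
    reader = create_binlog_stream_reader(config, None, None, '0-1-12345')
else:
    reader = create_binlog_stream_reader(config, 'mysql-bin.000003', 154, None)

for event in reader:
    pass  # handle WriteRowsEvent / UpdateRowsEvent / DeleteRowsEvent here

reader.close()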