def resolve_catalog(discovered_catalog, streams_to_sync):
    result = Catalog(streams=[])

    # Iterate over the streams in the input catalog and match each one up
    # with the same stream in the discovered catalog.
    for catalog_entry in streams_to_sync:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get("replication-key")

        discovered_table = discovered_catalog.get_stream(catalog_entry.tap_stream_id)
        database_name = common.get_database_name(catalog_entry)

        if not discovered_table:
            LOGGER.warning(
                "Database %s table %s was selected but does not exist",
                database_name,
                catalog_entry.table,
            )
            continue

        selected = {
            k
            for k, v in catalog_entry.schema.properties.items()
            if common.property_is_selected(catalog_entry, k) or k == replication_key
        }

        # These are the columns we need to select
        columns = desired_columns(selected, discovered_table.schema)

        result.streams.append(
            CatalogEntry(
                tap_stream_id=catalog_entry.tap_stream_id,
                metadata=catalog_entry.metadata,
                stream=catalog_entry.tap_stream_id,
                table=catalog_entry.table,
                schema=Schema(
                    type="object",
                    properties={
                        col: discovered_table.schema.properties[col] for col in columns
                    },
                ),
            )
        )

    return result
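
A minimal usage sketch of how resolve_catalog might be wired into a sync entry point. Only resolve_catalog, discover_catalog, common and metadata appear in the code on this page; sync_streams, user_catalog, the selection check and the two-argument discover_catalog call are illustrative assumptions, not the tap's actual entry point.

def sync_streams(mysql_conn, user_catalog, config):
    # Hypothetical wiring (sketch only): re-discover the current schemas so
    # resolve_catalog can match the selected streams against what actually
    # exists in the database right now.
    discovered_catalog = discover_catalog(mysql_conn, config.get('filter_dbs'))

    # Keep only the streams marked as selected in the user-supplied catalog
    # (selection flag read from stream-level metadata, as in resolve_catalog).
    streams_to_sync = [
        stream for stream in user_catalog.streams
        if metadata.to_map(stream.metadata).get((), {}).get('selected')
    ]

    # Trim each stream's schema down to its selected columns (plus the
    # replication key) as they were just discovered.
    return resolve_catalog(discovered_catalog, streams_to_sync)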
Example 2
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state,
                     config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            else:

                # Compare the event's columns to the schema properties. When a column has been
                # dropped, the event refers to it with a placeholder like __dropped_col_XY__;
                # those placeholders must not be included in the difference.
                diff = {
                    col for col in get_db_column_types(binlog_event)
                    if not re.match(r'__dropped_col_\d+__', col)
                }.difference(catalog_entry.schema.properties.keys())

                # If there are additional cols in the event then run discovery and update the catalog
                if diff:
                    LOGGER.debug('Difference between event and schema: %s',
                                 diff)
                    LOGGER.info('Running discovery ... ')

                    # run discovery for the current table only
                    new_catalog_entry = discover_catalog(
                        mysql_conn, config.get('filter_dbs'),
                        catalog_entry.table).streams[0]

                    selected = {
                        k
                        for k, v in
                        new_catalog_entry.schema.properties.items()
                        if common.property_is_selected(new_catalog_entry, k)
                    }

                    # The freshly discovered catalog entry's "stream" property is the table name;
                    # update it to match what the "resolve_catalog" function would have produced.
                    new_catalog_entry.stream = tap_stream_id

                    # These are the columns we need to select
                    new_columns = desired_columns(selected,
                                                  new_catalog_entry.schema)

                    cols = set(new_catalog_entry.schema.properties.keys())

                    # drop unsupported properties from schema
                    for col in cols:
                        if col not in new_columns:
                            new_catalog_entry.schema.properties.pop(col, None)

                    # Add the _sdc_deleted_at col
                    new_columns = add_automatic_properties(
                        new_catalog_entry, list(new_columns))

                    # Send the new schema to the target if it differs from the current one
                    if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                        write_schema_message(catalog_entry=new_catalog_entry)
                        catalog_entry = new_catalog_entry

                        # update this dictionary while we're at it
                        binlog_streams_map[tap_stream_id][
                            'catalog_entry'] = new_catalog_entry
                        binlog_streams_map[tap_stream_id][
                            'desired_columns'] = new_columns
                        columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator over python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet, but there seem to be cases where a MySQL server never sends
        # one, causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    # Update the singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
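
For context, a rough sketch of how the reader consumed by _run_binlog_sync above might be constructed and the function invoked. BinLogStreamReader, RotateEvent and the row event classes are python-mysql-replication's real API; sync_binlog_stream, get_start_log_file_and_pos and the exact connection-settings keys are assumptions for illustration.

from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.event import RotateEvent
from pymysqlreplication.row_event import (DeleteRowsEvent, UpdateRowsEvent,
                                           WriteRowsEvent)

def sync_binlog_stream(mysql_conn, config, binlog_streams_map, state):
    # Resume from the binlog file/position stored in the Singer state
    # (helper name is illustrative, not the tap's actual API).
    log_file, log_pos = get_start_log_file_and_pos(binlog_streams_map, state)

    reader = BinLogStreamReader(
        connection_settings={
            'host': config['host'],
            'port': int(config['port']),
            'user': config['user'],
            'passwd': config['password'],
        },
        server_id=int(config['server_id']),
        log_file=log_file,
        log_pos=log_pos,
        resume_stream=True,
        # Only ask the server for the event types _run_binlog_sync handles.
        only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent,
                     DeleteRowsEvent],
    )
    try:
        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config)
    finally:
        reader.close()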
Example 3
def _run_binlog_sync(mysql_conn: MySQLConnection, reader: BinLogStreamReader,
                     binlog_streams_map: Dict, state: Dict, config: Dict,
                     end_log_file: str, end_log_pos: int):

    processed_rows_events = 0
    events_skipped = 0

    log_file = None
    log_pos = None
    gtid_pos = reader.auto_position  # initial GTID, set when the reader instance was created

    # A set of columns detected during the sync that should be ignored because their types are unsupported.
    # Caching them here avoids re-checking whether a column should be ignored over and over again.
    ignored_columns = set()

    # Exit the loop when the reader runs out of events to return or we reach
    # the end position (the master's position captured before the sync started)
    for binlog_event in reader:

        # get reader current binlog file and position
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator over python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet, but there seem to be cases where a MySQL server never sends
        # one, causing binlog replication to hang.
        if (log_file > end_log_file) or (end_log_file == log_file
                                         and log_pos >= end_log_pos):
            LOGGER.info(
                'BinLog reader (file: %s, pos:%s) has reached or exceeded end position, exiting!',
                log_file, log_pos)

            # A mass operation (inserts, updates, deletes) can start right after we fetch the master's
            # binlog file and position above, leaving that position behind the stream reader. That would
            # lose data in the next run by skipping everything between end_log_file/end_log_pos and the
            # reader's position, so reset log_file and log_pos back to the master's position.
            log_file = end_log_file
            log_pos = end_log_pos

            break

        if isinstance(binlog_event, RotateEvent):
            LOGGER.debug('RotateEvent: log_file=%s, log_pos=%d',
                         binlog_event.next_binlog, binlog_event.position)

            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position, gtid_pos)

        elif isinstance(binlog_event, (MariadbGtidEvent, GtidEvent)):
            gtid_pos = binlog_event.gtid

            LOGGER.debug('%s: gtid=%s', binlog_event.__class__.__name__,
                         gtid_pos)

            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos, gtid_pos)

            # There is strange behavior happening when using GTID in the pymysqlreplication lib,
            # explained here: https://github.com/noplay/python-mysql-replication/issues/367
            # Fix: Updating the reader's auto-position to the newly encountered gtid means we won't have to restart
            # consuming binlog from old GTID pos when connection to server is lost.
            reader.auto_position = gtid_pos

        else:
            time_extracted = utils.now()

            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, processed_rows_events)
            else:
                # Compare event's columns to the schema properties
                diff = __get_diff_in_columns_list(
                    binlog_event, catalog_entry.schema.properties.keys(),
                    ignored_columns)

                # If there are additional cols in the event then run discovery if needed and update the catalog
                if diff:

                    LOGGER.info(
                        'Stream `%s`: Difference detected between event and schema: %s',
                        tap_stream_id, diff)

                    md_map = metadata.to_map(catalog_entry.metadata)

                    if not should_run_discovery(diff, md_map):
                        LOGGER.info(
                            'Stream `%s`: Not running discovery. Ignoring all detected columns in %s',
                            tap_stream_id, diff)
                        ignored_columns = ignored_columns.union(diff)

                    else:
                        LOGGER.info('Stream `%s`: Running discovery ... ',
                                    tap_stream_id)

                        # run discovery for the current table only
                        new_catalog_entry = discover_catalog(
                            mysql_conn, config.get('filter_dbs'),
                            catalog_entry.table).streams[0]

                        selected = {
                            k
                            for k, v in
                            new_catalog_entry.schema.properties.items()
                            if common.property_is_selected(
                                new_catalog_entry, k)
                        }

                        # The freshly discovered catalog entry's "stream" property is the table name;
                        # update it to match what the "resolve_catalog" function would have produced.
                        new_catalog_entry.stream = tap_stream_id

                        # These are the columns we need to select
                        new_columns = desired_columns(selected,
                                                      new_catalog_entry.schema)

                        cols = set(new_catalog_entry.schema.properties.keys())

                        # drop unsupported properties from schema
                        for col in cols:
                            if col not in new_columns:
                                new_catalog_entry.schema.properties.pop(
                                    col, None)

                        # Add the _sdc_deleted_at col
                        new_columns = add_automatic_properties(
                            new_catalog_entry, list(new_columns))

                        # Send the new schema to the target if it differs from the current one
                        if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                            write_schema_message(
                                catalog_entry=new_catalog_entry)
                            catalog_entry = new_catalog_entry

                            # update this dictionary while we're at it
                            binlog_streams_map[tap_stream_id][
                                'catalog_entry'] = new_catalog_entry
                            binlog_streams_map[tap_stream_id][
                                'desired_columns'] = new_columns
                            columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    processed_rows_events = handle_write_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    processed_rows_events = handle_update_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    processed_rows_events = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update singer bookmark and send STATE message periodically
        if ((processed_rows_events
             and processed_rows_events % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos, gtid_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    LOGGER.info('Processed %s rows', processed_rows_events)

    # Update the singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos,
                                 gtid_pos)
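
The __get_diff_in_columns_list helper used above is not shown on this page. Judging from the equivalent inline logic in Example 2, it plausibly looks like the reconstruction below (a sketch, not the tap's actual implementation); get_db_column_types is the helper already used in Example 2.

import re

def __get_diff_in_columns_list(binlog_event, schema_properties, ignored_columns):
    # Column names present in the binlog event.
    event_columns = set(get_db_column_types(binlog_event).keys())

    # Drop the placeholder names the binlog uses for columns that no longer
    # exist (e.g. __dropped_col_12__) and anything already marked as ignored.
    candidate_columns = {
        col for col in event_columns
        if not re.match(r'__dropped_col_\d+__', col)
        and col not in ignored_columns
    }

    # Whatever remains that the catalog schema does not yet know about.
    return candidate_columns.difference(schema_properties)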