Example #1
def do_sync_full_table(mysql_conn, config, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using full table replication",
                catalog_entry.stream)
    key_properties = common.get_key_properties(catalog_entry)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns,
                          stream_version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'version')

    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'initial_full_table_complete', True)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
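The bookmark helpers used above come from the singer-python library. A minimal standalone sketch (hypothetical stream id, not part of the tap) of how they manipulate the nested state dict:

import singer

state = {}

# write_bookmark creates state['bookmarks'][tap_stream_id][key] on demand
# and returns the updated state.
state = singer.write_bookmark(state, 'mydb-users', 'initial_full_table_complete', True)
# state == {'bookmarks': {'mydb-users': {'initial_full_table_complete': True}}}

# clear_bookmark drops a single key for the stream, leaving the others intact.
state = singer.clear_bookmark(state, 'mydb-users', 'version')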
Example #2
def handle_write_rows_event(event, catalog_entry, state, columns, rows_saved,
                            time_extracted):
    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    db_column_types = get_db_column_types(event)

    for row in event.rows:
        filtered_vals = {
            k: v
            for k, v in row['values'].items() if k in columns
        }

        record_message = row_to_singer_record(catalog_entry, stream_version,
                                              db_column_types, filtered_vals,
                                              time_extracted)

        singer.write_message(record_message)
        rows_saved += 1

    return rows_saved
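The column filter above is a plain dict comprehension; a quick standalone illustration with hypothetical row data:

row_values = {'id': 1, 'name': 'ada', 'internal_flag': 'x'}
columns = ['id', 'name']  # columns selected in the catalog

filtered_vals = {k: v for k, v in row_values.items() if k in columns}
# filtered_vals == {'id': 1, 'name': 'ada'}; deselected columns never
# reach the record message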
Example #3
def handle_delete_rows_event(event, catalog_entry, state, columns, rows_saved,
                             time_extracted):
    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    db_column_types = get_db_column_types(event)

    for row in event.rows:
        event_ts = datetime.datetime.utcfromtimestamp(
            event.timestamp).replace(tzinfo=pytz.UTC)
        vals = row['values']

        vals[SDC_DELETED_AT] = event_ts

        filtered_vals = {k: v for k, v in vals.items() if k in columns}

        record_message = row_to_singer_record(catalog_entry, stream_version,
                                              db_column_types, filtered_vals,
                                              time_extracted)

        singer.write_message(record_message)

        rows_saved += 1

    return rows_saved
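Deletes are not dropped: each deleted row is emitted as a record carrying an SDC_DELETED_AT timestamp. The timestamp conversion shown standalone, with a hypothetical epoch value standing in for event.timestamp:

import datetime

import pytz

event_timestamp = 1_700_000_000  # hypothetical binlog event timestamp (epoch seconds)
event_ts = datetime.datetime.utcfromtimestamp(event_timestamp).replace(tzinfo=pytz.UTC)
print(event_ts.isoformat())  # 2023-11-14T22:13:20+00:00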
Example #4
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns, use_gtid: bool, engine: str):
    binlog.verify_binlog_config(mysql_conn)

    if use_gtid and engine == MYSQL_ENGINE:
        binlog.verify_gtid_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)

    if is_view:
        raise Exception(f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view.")

    log_file = singer.get_bookmark(state,
                                   catalog_entry.tap_stream_id,
                                   'log_file')

    log_pos = singer.get_bookmark(state,
                                  catalog_entry.tap_stream_id,
                                  'log_pos')

    gtid = None
    if use_gtid:
        gtid = singer.get_bookmark(state,
                                   catalog_entry.tap_stream_id,
                                   'gtid')

    max_pk_values = singer.get_bookmark(state,
                                        catalog_entry.tap_stream_id,
                                        'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    if max_pk_values and ((use_gtid and gtid) or (log_file and log_pos)):
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
    else:
        LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'initial_binlog_complete',
                                      False)

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

        current_gtid = None
        if use_gtid:
            current_gtid = binlog.fetch_current_gtid_pos(mysql_conn, engine)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
            # We must save log_file, log_pos, gtid across FULL_TABLE syncs when using
            # an incrementing PK
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            if current_gtid:
                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'gtid',
                                              current_gtid)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            if current_gtid:
                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'gtid',
                                              current_gtid)
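The resume gate on the first branch condenses to a single boolean. Extracted here as a standalone helper (a sketch for clarity, not part of the tap):

def can_resume_full_table(max_pk_values, use_gtid, gtid, log_file, log_pos):
    # Resume only if a partial snapshot exists (max_pk_values) AND we know
    # where binlog replication must pick up afterwards: a GTID when GTID
    # mode is enabled, otherwise a log file/position pair.
    return bool(max_pk_values and ((use_gtid and gtid) or (log_file and log_pos)))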
Example #5
def sync_table(connection, config, catalog_entry, state, columns):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                   'log_file')

    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                  'log_pos')

    verify_binlog_config(connection, catalog_entry)
    verify_log_file_exists(connection, catalog_entry, log_file, log_pos)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    server_id = fetch_server_id(connection)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    table_path = (catalog_entry.database, catalog_entry.stream)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                log_file, log_pos)

    rows_saved = 0

    for binlog_event in reader:
        if reader.log_file == log_file and reader.log_pos == log_pos:
            LOGGER.info(
                "Skipping event for log_file=%s and log_pos=%s as it was processed last sync",
                reader.log_file, reader.log_pos)
            continue

        if isinstance(binlog_event, RotateEvent):
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', binlog_event.next_binlog)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', binlog_event.position)

        elif (binlog_event.schema, binlog_event.table) == table_path:
            db_column_types = {c.name: c.type for c in binlog_event.columns}

            if isinstance(binlog_event, WriteRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {
                        k: v
                        for k, v in row['values'].items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)

                    rows_saved += 1

            elif isinstance(binlog_event, UpdateRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {
                        k: v
                        for k, v in row['after_values'].items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)

                    rows_saved += 1
            elif isinstance(binlog_event, DeleteRowsEvent):
                for row in binlog_event.rows:
                    event_ts = datetime.datetime.utcfromtimestamp(
                        binlog_event.timestamp).replace(tzinfo=pytz.UTC)

                    vals = row['values']
                    vals[SDC_DELETED_AT] = event_ts

                    filtered_vals = {
                        k: v
                        for k, v in vals.items() if k in columns
                    }

                    yield row_to_singer_record(catalog_entry, stream_version,
                                               db_column_types, filtered_vals,
                                               time_extracted)

                    rows_saved += 1

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', reader.log_file)

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', reader.log_pos)

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                yield singer.StateMessage(value=copy.deepcopy(state))

    yield singer.StateMessage(value=copy.deepcopy(state))
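Unlike the earlier examples, this sync_table is a generator: it yields RecordMessage and periodic StateMessage objects instead of writing them itself. A hedged usage sketch, assuming the connection, config, catalog entry, state, and column list are prepared by the caller:

for message in sync_table(connection, config, catalog_entry, state, columns):
    singer.write_message(message)  # serialize each yielded message to stdout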
Example #6
def do_sync_historical_binlog(mysql_conn, config, catalog_entry, state, columns):
    binlog.verify_binlog_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)
    key_properties = common.get_key_properties(catalog_entry)

    if is_view:
        raise Exception("Unable to replicate stream({}) with binlog because it is a view.".format(catalog_entry.stream))

    log_file = singer.get_bookmark(state,
                                   catalog_entry.tap_stream_id,
                                   'log_file')

    log_pos = singer.get_bookmark(state,
                                  catalog_entry.tap_stream_id,
                                  'log_pos')

    max_pk_values = singer.get_bookmark(state,
                                        catalog_entry.tap_stream_id,
                                        'max_pk_values')

    last_pk_fetched = singer.get_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'last_pk_fetched')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    if log_file and log_pos and max_pk_values:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    else:
        LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'initial_binlog_complete',
                                      False)

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)
        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        if full_table.sync_is_resumable(mysql_conn, catalog_entry):
            # We must save log_file and log_pos across FULL_TABLE syncs when performing
            # a resumable full table sync
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)
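Note the asymmetry between the two branches above: for a resumable sync the binlog coordinates are bookmarked before full_table.sync_table runs, so an interrupted snapshot can resume and still replay the binlog from where the snapshot began; for a non-resumable sync they are written only after the snapshot completes, so an interruption leaves no binlog bookmark and the next run restarts the full table from scratch.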
Example #7
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns):
    binlog.verify_binlog_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)

    if is_view:
        raise Exception(
            f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view."
        )

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, "log_file")

    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, "log_pos")

    max_pk_values = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, "max_pk_values"
    )

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    if log_file and log_pos and max_pk_values:
        LOGGER.info(
            "Resuming initial full table sync for LOG_BASED stream %s",
            catalog_entry.tap_stream_id,
        )
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    else:
        LOGGER.info(
            "Performing initial full table sync for LOG_BASED stream %s",
            catalog_entry.tap_stream_id,
        )

        state = singer.write_bookmark(
            state, catalog_entry.tap_stream_id, "initial_binlog_complete", False
        )

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(
            mysql_conn
        )
        state = singer.write_bookmark(
            state, catalog_entry.tap_stream_id, "version", stream_version
        )

        if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
            # We must save log_file and log_pos across FULL_TABLE syncs when using
            # an incrementing PK
            state = singer.write_bookmark(
                state, catalog_entry.tap_stream_id, "log_file", current_log_file
            )

            state = singer.write_bookmark(
                state, catalog_entry.tap_stream_id, "log_pos", current_log_pos
            )

            full_table.sync_table(
                mysql_conn, catalog_entry, state, columns, stream_version
            )

        else:
            full_table.sync_table(
                mysql_conn, catalog_entry, state, columns, stream_version
            )
            state = singer.write_bookmark(
                state, catalog_entry.tap_stream_id, "log_file", current_log_file
            )

            state = singer.write_bookmark(
                state, catalog_entry.tap_stream_id, "log_pos", current_log_pos
            )
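This variant gates on full_table.pks_are_auto_incrementing. A rough sketch of the kind of information_schema query such a check could run (an assumption for illustration, not the tap's actual implementation):

# Hypothetical check: does the table's primary key carry the
# auto_increment attribute?
sql = """
    SELECT 1
      FROM information_schema.columns
     WHERE table_schema = %(database)s
       AND table_name = %(table)s
       AND column_key = 'PRI'
       AND extra LIKE '%%auto_increment%%'
"""
# cur.execute(sql, {'database': catalog_entry.database,
#                   'table': catalog_entry.table})
# bool(cur.fetchone()) -> True when an auto_increment PK column exists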
Example #8
def sync_table(mysql_conn, catalog_entry, state, columns, limit=None):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    iterate_limit = True
    while iterate_limit:

        replication_key_metadata = stream_metadata.get('replication-key')
        replication_key_state = singer.get_bookmark(state,
                                                    catalog_entry.tap_stream_id,
                                                    'replication_key')

        replication_key_value = None

        if replication_key_metadata == replication_key_state:
            replication_key_value = singer.get_bookmark(state,
                                                        catalog_entry.tap_stream_id,
                                                        'replication_key_value')
        else:
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'replication_key',
                                          replication_key_metadata)
            state = singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'replication_key_value')

        stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        activate_version_message = singer.ActivateVersionMessage(
            stream=catalog_entry.stream,
            version=stream_version
        )

        singer.write_message(activate_version_message)

        with connect_with_backoff(mysql_conn) as open_conn:
            with open_conn.cursor() as cur:
                select_sql = common.generate_select_sql(catalog_entry, columns)
                params = {}

                if replication_key_value is not None:
                    if catalog_entry.schema.properties[replication_key_metadata].format == 'date-time':
                        replication_key_value = pendulum.parse(replication_key_value)

                    select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                        replication_key_metadata,
                        replication_key_metadata)

                    params['replication_key_value'] = replication_key_value
                elif replication_key_metadata is not None:
                    select_sql += ' ORDER BY `{}` ASC'.format(replication_key_metadata)

                if limit:
                    select_sql += ' LIMIT {}'.format(limit)

                num_rows = common.sync_query(cur,
                                             catalog_entry,
                                             state,
                                             select_sql,
                                             columns,
                                             stream_version,
                                             params)
                if limit is None or num_rows < limit:
                    iterate_limit = False
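With a replication key bookmarked, the statement built above gains a WHERE/ORDER BY clause. Assuming a hypothetical table and a replication key named updated_at, the final SQL is shaped like:

select_sql = 'SELECT `id`, `name`, `updated_at` FROM `mydb`.`users`'  # hypothetical base query
select_sql += ' WHERE `updated_at` >= %(replication_key_value)s ORDER BY `updated_at` ASC'
select_sql += ' LIMIT 1000'  # appended only when a limit is passed
params = {'replication_key_value': '2023-01-01T00:00:00+00:00'}
# The inclusive >= re-reads the bookmarked row on the next run: one
# duplicate record, but ties on the replication key are never skipped.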
Example #9
def generate_messages(con, config, catalog, state):
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = md_map.get((), {}).get('replication-key')

        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [replication_key])

                for message in incremental.sync_table(con, catalog_entry,
                                                      state, columns):
                    yield message
            elif replication_method == 'LOG_BASED':
                if catalog_entry.is_view:
                    raise Exception(
                        "Unable to replicate stream({}) with binlog because it is a view."
                        .format(catalog_entry.stream))

                LOGGER.info("Stream %s is using binlog replication",
                            catalog_entry.stream)

                log_file = singer.get_bookmark(state,
                                               catalog_entry.tap_stream_id,
                                               'log_file')

                log_pos = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'log_pos')

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                if log_file and log_pos:
                    columns = binlog.add_automatic_properties(
                        catalog_entry, columns)

                    for message in binlog.sync_table(con, config,
                                                     catalog_entry, state,
                                                     columns):
                        yield message
                else:
                    LOGGER.info("Performing initial full table sync")

                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(
                        con)

                    stream_version = common.get_stream_version(
                        catalog_entry.tap_stream_id, state)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'version', stream_version)

                    for message in full_table.sync_table(
                            con, catalog_entry, state, columns,
                            stream_version):
                        yield message

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_file', log_file)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_pos', log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))
            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                stream_version = common.get_stream_version(
                    catalog_entry.tap_stream_id, state)

                for message in full_table.sync_table(con, catalog_entry, state,
                                                     columns, stream_version):
                    yield message

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'version')

                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'initial_full_table_complete',
                                              True)

                yield singer.StateMessage(value=copy.deepcopy(state))
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported"
                )

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
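generate_messages is itself a generator; a typical driver (a sketch of how a tap's do_sync entry point might drain it, assumed here) simply serializes every yielded message:

def do_sync(con, config, catalog, state):
    for message in generate_messages(con, config, catalog, state):
        singer.write_message(message)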