Example #1
def sync_non_oplog_streams(client, streams, state):
    for stream in streams:
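        # Top-level stream metadata lives under the empty breadcrumb ().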
        md_map = metadata.to_map(stream['metadata'])
        stream_metadata = md_map.get(())
        select_clause = stream_metadata.get('custom-select-clause')

        if not select_clause:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                stream['tap_stream_id'])
            continue

        columns = [c.strip(' ') for c in select_clause.split(',')]
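        # _id is always needed to identify records, so include it regardless of selection.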
        columns.append('_id')

        state = singer.set_currently_syncing(state, stream['tap_stream_id'])

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        replication_method = stream_metadata.get('replication-method')

        database_name = get_database_name(stream)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = stream['table_name']

            if replication_method == 'LOG_BASED':
                do_sync_historical_oplog(client, stream, state, columns)
            elif replication_method == 'FULL_TABLE':
                write_schema_message(stream)
                stream_version = common.get_stream_version(
                    stream['tap_stream_id'], state)
                full_table.sync_table(client, stream, state, stream_version,
                                      columns)

                state = singer.write_bookmark(state, stream['tap_stream_id'],
                                              'initial_full_table_complete',
                                              True)
            else:
                raise Exception(
                    f"Only LOG_BASED and FULL_TABLE replication methods are supported (you passed {replication_method})"
                )

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #2
def do_sync_historical_oplog(client, stream, state, columns):
    oplog_ts_time = singer.get_bookmark(state, stream['tap_stream_id'],
                                        'oplog_ts_time')

    oplog_ts_inc = singer.get_bookmark(state, stream['tap_stream_id'],
                                       'oplog_ts_inc')

    max_id_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                       'max_id_value')

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    write_schema_message(stream)

    stream_version = common.get_stream_version(stream['tap_stream_id'], state)

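    # If oplog bookmarks and a max _id value already exist, a previous initial
    # full table sync was interrupted; resume it instead of starting over.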
    if oplog_ts_time and oplog_ts_inc and max_id_value:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s",
                    stream['tap_stream_id'])
        full_table.sync_table(client, stream, state, stream_version, columns)

    else:
        LOGGER.info(
            "Performing initial full table sync for LOG_BASED stream %s",
            stream['tap_stream_id'])

        current_oplog_ts = oplog.get_latest_ts(client)

        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'version', stream_version)

        # Save the oplog ts across FULL_TABLE syncs so LOG_BASED replication can resume from it
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'oplog_ts_time', current_oplog_ts.time)

        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'oplog_ts_inc', current_oplog_ts.inc)

        full_table.sync_table(client, stream, state, stream_version, columns)
Example #3
def sync_non_oplog_streams(client, streams, state):
    for stream in streams:
        md_map = metadata.to_map(stream['metadata'])
        stream_metadata = md_map.get(())
        blacklisted_fields = stream_metadata.get('blacklisted-fields')
        # Guard against the metadata entry being absent so split() is not called on None.
        blacklist = ([c.strip(' ') for c in blacklisted_fields.split(',')]
                     if blacklisted_fields else [])
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        replication_method = stream_metadata.get('replication-method')

        database_name = get_database_name(stream)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = stream['table_name']

            if replication_method == 'LOG_BASED':
                do_sync_historical_oplog(client, stream, state, blacklist)
            elif replication_method == 'FULL_TABLE':
                write_schema_message(stream)
                stream_version = common.get_stream_version(
                    stream['tap_stream_id'], state)
                full_table.sync_table(client, stream, state, stream_version,
                                      blacklist)
            else:
                raise Exception(
                    f"Only LOG_BASED and FULL_TABLE replication methods are supported (you passed {replication_method})"
                )

            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'initial_full_table_complete', True)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #4
def sync_collection(client, stream, state, stream_projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting oplog sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')
    collection_name = stream.get("table_name")
    stream_state = state.get('bookmarks', {}).get(tap_stream_id)

    oplog_ts = timestamp.Timestamp(stream_state['oplog_ts_time'],
                                   stream_state['oplog_ts_inc'])

    # Write activate version message
    version = common.get_stream_version(tap_stream_id, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=version)
    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    start_time = time.time()

    oplog_query = {'ts': {'$gte': oplog_ts}}

    projection = transform_projection(stream_projection)

    oplog_replay = stream_projection is None

    LOGGER.info(
        'Querying %s with:\n\tFind Parameters: %s\n\tProjection: %s\n\toplog_replay: %s',
        tap_stream_id, oplog_query, projection, oplog_replay)

    update_buffer = set()
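    # update_buffer collects the _ids of updated documents; flush_buffer later re-fetches the full documents in batches.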

    # consider adding oplog_replay, but this would require removing the projection;
    # the default behavior is a non-tailable cursor, but we might want a tailable one
    # regardless of whether it's long-lived or not.
    last_ts = oplog_ts
    with client.local.oplog.rs.find(oplog_query,
                                    projection,
                                    sort=[('$natural', pymongo.ASCENDING)],
                                    oplog_replay=oplog_replay) as cursor:
        for row in cursor:
            # assertions that Mongo is respecting the ts query and sort order;
            # track the previous row's ts so ascending order can actually be checked
            if row.get('ts') and row.get('ts') < oplog_ts:
                raise common.MongoAssertionException(
                    "Mongo is not honoring the query param")
            if row.get('ts') and row.get('ts') < last_ts:
                raise common.MongoAssertionException(
                    "Mongo is not honoring the sort ascending param")
            if row.get('ts'):
                last_ts = row['ts']

            if row.get('ns') != '{}.{}'.format(database_name, collection_name):
                if row.get('ts'):
                    state = update_bookmarks(state, tap_stream_id, row['ts'])
                continue

            row_op = row['op']
            if row_op == 'i':

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            elif row_op == 'u':
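                # Update entries carry the target _id in 'o2'; buffer it so the full document can be re-fetched later.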
                update_buffer.add(row['o2']['_id'])

            elif row_op == 'd':

                # remove update from buffer if that document has been deleted
                if row['o']['_id'] in update_buffer:
                    update_buffer.remove(row['o']['_id'])

                # Delete ops only contain the _id of the row deleted
                row['o'][SDC_DELETED_AT] = row['ts']

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            state = update_bookmarks(state, tap_stream_id, row['ts'])

            # flush buffer if it has filled up
            if len(update_buffer) >= MAX_UPDATE_BUFFER_LENGTH:
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

            # write state every UPDATE_BOOKMARK_PERIOD messages
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                # flush buffer before writing state
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

                # write state
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        # flush buffer if finished with oplog
        for buffered_row in flush_buffer(client, update_buffer,
                                         stream_projection, database_name,
                                         collection_name):
            record_message = common.row_to_singer_record(
                stream, buffered_row, version, time_extracted)

            singer.write_message(record_message)
            rows_saved += 1

    common.COUNTS[tap_stream_id] += rows_saved
    common.TIMES[tap_stream_id] += time.time() - start_time
    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
Example #5
def sync_oplog_stream(client, streams, state):

    streams_map = generate_streams_map(streams)

    #for tap_stream_id in streams_map.keys():
    #    common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    for tap_stream_id, bookmark in state.get('bookmarks', {}).items():
        oplog_ts = min([timestamp.Timestamp(v['oplog_ts_time'], v['oplog_ts_inc'])
                        for k,v in state.get('bookmarks', {}).items()
                        if streams_map.get(k)])

        LOGGER.info("Starting oplog replication with ts=%s", oplog_ts)

        time_extracted = utils.now()

        rows_saved = 0
        ops_skipped = 0

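        # oplog_replay=True lets MongoDB optimize the ts-range scan over the oplog.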
        with client.local.oplog.rs.find({'ts': {'$gt': oplog_ts}},
                                        oplog_replay=True) as cursor:

            while cursor.alive:
                try:
                    row = next(cursor)

                    if row['op'] == 'n':
                        LOGGER.debug('Skipping noop op')
                    elif not streams_map.get(generate_tap_stream_id_for_row(row)):
                        ops_skipped = ops_skipped + 1

                        if ops_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                            LOGGER.info("Skipped %s ops so far as they were not for selected tables; %s rows extracted",
                                        ops_skipped,
                                        rows_saved)
                    else:
                        rows_saved += 1
                        row_op = row['op']
                        if row_op in ['i']:
                            tap_stream_id = generate_tap_stream_id_for_row(row)
                            stream_map_entry = streams_map[tap_stream_id]
                            whitelisted_row = {k:v for k,v in row['o'].items() if k not in stream_map_entry['blacklist']}
                            record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                        whitelisted_row,
                                                                        common.get_stream_version(tap_stream_id, state),
                                                                        time_extracted)

                            singer.write_message(record_message)
                        elif row_op in ['u']:
                            tap_stream_id = generate_tap_stream_id_for_row(row)
                            stream_map_entry = streams_map[tap_stream_id]

                            # if '$set' in row['o'].keys():
                            #     obj = dict(row['o2'], **row['o']['$set'])
                            # else:
                            #     obj = row['o']

                            whitelisted_row = {k:v for k,v in row['o'].items() if k not in stream_map_entry['blacklist']}
                            record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                        whitelisted_row,
                                                                        common.get_stream_version(tap_stream_id, state),
                                                                        time_extracted)

                            singer.write_message(record_message)
                        elif row_op == 'd':
                            tap_stream_id = generate_tap_stream_id_for_row(row)
                            stream_map_entry = streams_map[tap_stream_id]

                            # Delete ops only contain the _id of the row deleted
                            whitelisted_row = {}
                            whitelisted_row['_id'] = row['o']['_id']
                            whitelisted_row[SDC_DELETED_AT] = row['ts']

                            record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                        whitelisted_row,
                                                                        common.get_stream_version(tap_stream_id, state),
                                                                        time_extracted)
                            singer.write_message(record_message)
                        else:
                            LOGGER.info("Skipping op for table %s as it is not an INSERT, UPDATE, or DELETE", row['ns'])

                    state = update_bookmarks(state,
                                             streams_map,
                                             row['ts'])
                except InvalidBSON as e:
                    LOGGER.info(e)
                    continue

                # periodically write state, but only once rows have actually been saved
                if rows_saved and rows_saved % 1000 == 0:
                    singer.write_state(state)

            # Send state message at the end
            singer.write_state(state)
Example #6
def sync_oplog_stream(client, streams, state):
    streams_map = generate_streams_map(streams)

    for tap_stream_id in streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    for tap_stream_id, bookmark in state.get('bookmarks', {}).items():
        columns = streams_map.get(tap_stream_id)

        if not columns:
            continue

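        # Start replay from the oldest bookmarked ts among the selected streams so no ops are missed.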
        oplog_ts = min([timestamp.Timestamp(v['oplog_ts_time'], v['oplog_ts_inc'])
                        for k,v in state.get('bookmarks', {}).items()
                        if streams_map.get(k)])

        LOGGER.info("Starting oplog replication with ts=%s", oplog_ts)

        time_extracted = utils.now()

        rows_saved = 0
        ops_skipped = 0

        with client.local.oplog.rs.find({'ts': {'$gt': oplog_ts}},
                                        oplog_replay=True) as cursor:
            for row in cursor:
                if row['op'] == 'n':
                    LOGGER.info('Skipping noop op')
                elif not streams_map.get(generate_tap_stream_id_for_row(row)):
                    ops_skipped = ops_skipped + 1

                    if ops_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                        LOGGER.info("Skipped %s ops so far as they were not for selected tables; %s rows extracted",
                                    ops_skipped,
                                    rows_saved)
                else:
                    row_op = row['op']
                    if row_op in ['i', 'u']:
                        tap_stream_id = generate_tap_stream_id_for_row(row)
                        stream_map_entry = streams_map[tap_stream_id]
                        whitelisted_row = {k:v for k,v in row['o'].items() if k in stream_map_entry['columns']}
                        record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                     whitelisted_row,
                                                                     common.get_stream_version(tap_stream_id, state),
                                                                     time_extracted)

                        singer.write_message(record_message)

                    elif row_op == 'd':
                        tap_stream_id = generate_tap_stream_id_for_row(row)
                        stream_map_entry = streams_map[tap_stream_id]

                        # Delete ops only contain the _id of the row deleted
                        whitelisted_row = {column_name:None for column_name in stream_map_entry['columns']}

                        whitelisted_row['_id'] = row['o']['_id']
                        whitelisted_row[SDC_DELETED_AT] = row['ts']

                        record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                     whitelisted_row,
                                                                     common.get_stream_version(tap_stream_id, state),
                                                                     time_extracted)
                        singer.write_message(record_message)
                    else:
                        LOGGER.info("Skipping op for table %s as it is not an INSERT, UPDATE, or DELETE", row['ns'])

                state = update_bookmarks(state,
                                         streams_map,
                                         row['ts'])
Example #7
    def test_get_stream_version_with_defined_version_returns_the_same_version(
            self):

        state = {'bookmarks': {'myStream': {'version': 123}}}
        self.assertEqual(123, common.get_stream_version('myStream', state))
Example #8
    def test_get_stream_version_with_none_version_returns_new_version(self):

        state = {'bookmarks': {'myStream': {}}}
        self.assertGreaterEqual(time.time() * 1000,
                                common.get_stream_version('myStream', state))
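
Taken together, these two tests pin down the contract of common.get_stream_version: return the bookmarked 'version' for the stream when one exists, otherwise mint a new one from the current epoch time in milliseconds. A minimal sketch consistent with the tests (the actual implementation in common may differ):

import time

import singer


def get_stream_version(tap_stream_id, state):
    # Reuse the bookmarked version if the stream already has one.
    stream_version = singer.get_bookmark(state, tap_stream_id, 'version')

    if stream_version is None:
        # Otherwise mint a new version from the current epoch time in
        # milliseconds, which is what the second test asserts against.
        stream_version = int(time.time() * 1000)

    return stream_version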