Code Example #1
def flush_buffer(buffer: Dict[str, Set], streams: Dict[str, Dict], database: Database, rows_saved: Dict[str, int]):
    """
    Flush and reset the given buffer, it increments the row_saved count in the given rows_saved dictionary
    Args:
        database: mongoDB DB instance
        buffer: A set of rows to flush per stream
        streams: streams whose rows to flush
        rows_saved: map of streams to number of rows saved, this dictionary needs to be incremented
    Returns:

    """
    # flush all streams buffers
    for stream_id, stream_buffer in buffer.items():

        if stream_buffer:
            stream = streams[stream_id]

            for buffered_row in get_buffer_rows_from_db(database[stream['table_name']],
                                                        stream_buffer):
                record_message = common.row_to_singer_record(stream=stream,
                                                             row=buffered_row,
                                                             time_extracted=utils.now(),
                                                             time_deleted=None)
                singer.write_message(record_message)

                rows_saved[stream_id] += 1

            buffer[stream_id].clear()
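
The helper get_buffer_rows_from_db is not shown here; a minimal sketch of what it plausibly does, assuming it simply re-reads the full documents for the buffered _id values (an assumption, not necessarily this project's exact implementation):

def get_buffer_rows_from_db(collection, update_buffer):
    # assumed behavior: fetch the current full documents for every buffered _id in a single query
    return collection.find({'_id': {'$in': list(update_buffer)}})
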
Code Example #2
    def test_row_to_singer_record_successful_transformation_with_deleted(self):
        stream = {
            'stream': 'myStream',
            'metadata': [{
                'breadcrumb': [],
                'metadata': {}
            }]
        }

        row = {
            '_id': ObjectId('0123456789ab0123456789ab'),
            'key1': 10,
            'key2': Timestamp(1589379991, 4696183),
            'key3': 1.5
        }
        dt = datetime(2020, 5, 13, 14, 10, 10, tzinfo=tzutc())

        result = common.row_to_singer_record(
            stream, row, dt, datetime(2020, 5, 20, 15, 0, 0, 0,
                                      tzinfo=tzutc()), 100)

        self.assertEqual(
            {
                'type': 'RECORD',
                'stream': 'myStream',
                'record': {
                    '_id': '0123456789ab0123456789ab',
                    'document': {
                        '_id': '0123456789ab0123456789ab',
                        'key1': 10,
                        'key2': '2020-05-13T14:26:31.000000Z',
                        'key3': 1.5
                    },
                    common.SDC_DELETED_AT: '2020-05-20T15:00:00.000000Z',
                },
                'version': 100,
                'time_extracted': '2020-05-13T14:10:10.000000Z',
            }, result.asdict())
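
As a quick check of the expected 'key2' value above: the bson Timestamp's epoch-seconds component corresponds to the asserted UTC string.

from datetime import datetime, timezone

# 1589379991 seconds since the epoch is 2020-05-13 14:26:31 UTC,
# matching the '2020-05-13T14:26:31.000000Z' value asserted in the test
print(datetime.fromtimestamp(1589379991, tz=timezone.utc).isoformat())
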
Code Example #3
File: full_table.py  Project: psschroeter/tap-mongodb
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    #pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched,
                                                     last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(
                            stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time(
            ) - schema_build_start_time

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)

            singer.write_message(record_message)

            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced {} records for {}'.format(rows_saved, tap_stream_id))
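
The _id bookmarks above are round-tripped through JSON state with common.class_to_string / common.string_to_class. A small illustration of the intended round trip for ObjectId keys, assuming (as the calls above suggest) that the helpers serialize a value together with its type name and reconstruct it from that pair:

from bson import objectid

max_id = objectid.ObjectId('0123456789ab0123456789ab')
type_name = max_id.__class__.__name__                    # 'ObjectId'
as_string = common.class_to_string(max_id, type_name)    # JSON-safe string stored in the state bookmark
restored = common.string_to_class(as_string, type_name)  # back to an ObjectId usable in the find() filter
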
Code Example #4
def sync_collection(
    collection: Collection,
    stream: Dict,
    state: Optional[Dict],
) -> None:
    """
    Syncs the stream records incrementally
    Args:
        collection: MongoDB collection instance
        stream: stream dictionary
        state: state dictionary if exists
    """
    LOGGER.info('Starting incremental sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # pick a new table version on the first run, otherwise reuse the bookmarked one
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'], {})

    replication_key_name = metadata.to_map(stream['metadata']).get(
        ()).get('replication-key')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}

    if stream_state.get('replication_key_value'):
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gt'] = common.string_to_class(
            stream_state.get('replication_key_value'),
            stream_state.get('replication_key_type'))

    # log query
    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'],
                dict(find=find_filter))

    with collection.find(find_filter,
                         sort=[(replication_key_name, pymongo.ASCENDING)
                               ]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:

            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))
            rows_saved += 1

            update_bookmark(row, state, stream['tap_stream_id'],
                            replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, stream['tap_stream_id'])
Code Example #5
File: oplog.py  Project: mrtrantuan/tap-mongodb
def sync_collection(client, stream, state, stream_projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting oplog sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')
    collection_name = stream.get("table_name")
    stream_state = state.get('bookmarks', {}).get(tap_stream_id)

    oplog_ts = timestamp.Timestamp(stream_state['oplog_ts_time'],
                                   stream_state['oplog_ts_inc'])

    # Write activate version message
    version = common.get_stream_version(tap_stream_id, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=version)
    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    start_time = time.time()

    oplog_query = {'ts': {'$gte': oplog_ts}}

    projection = transform_projection(stream_projection)

    oplog_replay = stream_projection is None

    LOGGER.info(
        'Querying %s with:\n\tFind Parameters: %s\n\tProjection: %s\n\toplog_replay: %s',
        tap_stream_id, oplog_query, projection, oplog_replay)

    update_buffer = set()

    # consider adding oplog_replay, but this would require removing the projection.
    # The default behavior is a non-tailable cursor, but we might want a tailable one
    # regardless of whether it's long-lived or not.
    with client.local.oplog.rs.find(oplog_query,
                                    projection,
                                    sort=[('$natural', pymongo.ASCENDING)],
                                    oplog_replay=oplog_replay) as cursor:
        for row in cursor:
            # assert that Mongo is respecting the ts query and the ascending sort order
            if row.get('ts') and row.get('ts') < oplog_ts:
                raise common.MongoAssertionException(
                    "Mongo is not honoring the query param")
            if row.get('ts') and row.get('ts') < timestamp.Timestamp(
                    stream_state['oplog_ts_time'],
                    stream_state['oplog_ts_inc']):
                raise common.MongoAssertionException(
                    "Mongo is not honoring the sort ascending param")

            if row.get('ns') != '{}.{}'.format(database_name, collection_name):
                if row.get('ts'):
                    state = update_bookmarks(state, tap_stream_id, row['ts'])
                continue

            row_op = row['op']
            if row_op == 'i':

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            elif row_op == 'u':
                update_buffer.add(row['o2']['_id'])

            elif row_op == 'd':

                # remove update from buffer if that document has been deleted
                if row['o']['_id'] in update_buffer:
                    update_buffer.remove(row['o']['_id'])

                # Delete ops only contain the _id of the row deleted
                row['o'][SDC_DELETED_AT] = row['ts']

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            state = update_bookmarks(state, tap_stream_id, row['ts'])

            # flush buffer if it has filled up
            if len(update_buffer) >= MAX_UPDATE_BUFFER_LENGTH:
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

            # write state every UPDATE_BOOKMARK_PERIOD messages
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                # flush buffer before writing state
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

                # write state
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        # flush buffer if finished with oplog
        for buffered_row in flush_buffer(client, update_buffer,
                                         stream_projection, database_name,
                                         collection_name):
            record_message = common.row_to_singer_record(
                stream, buffered_row, version, time_extracted)

            singer.write_message(record_message)
            rows_saved += 1

    common.COUNTS[tap_stream_id] += rows_saved
    common.TIMES[tap_stream_id] += time.time() - start_time
    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
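
flush_buffer here (the oplog flavor) is defined elsewhere in oplog.py; a plausible sketch, assuming it simply re-reads the current documents for the buffered _ids with the stream projection applied (an illustration, not necessarily the project's exact implementation):

def flush_buffer(client, update_buffer, stream_projection, db_name, collection_name):
    # look up the full, current documents for every buffered _id
    query = {'_id': {'$in': list(update_buffer)}}
    with client[db_name][collection_name].find(query, stream_projection) as cursor:
        yield from cursor
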
Code Example #6
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)

    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    #pick a new table version if last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )


    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})

    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))

    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)


    # query collection
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        for row in cursor:
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time


            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)

            # gen_schema = common.row_to_schema_message(schema, record_message.record, row)
            # if DeepDiff(schema, gen_schema, ignore_order=True) != {}:
            #   emit gen_schema
            #   schema = gen_schema
            singer.write_message(record_message)
            rows_saved += 1

            update_bookmark(row, state, tap_stream_id, replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))


        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time()-start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
Code Example #7
File: oplog.py  Project: checkr/tap-mongodb
def sync_oplog_stream(client, streams, state):

    streams_map = generate_streams_map(streams)

    #for tap_stream_id in streams_map.keys():
    #    common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    for tap_stream_id, bookmark in state.get('bookmarks', {}).items():
        oplog_ts = min([timestamp.Timestamp(v['oplog_ts_time'], v['oplog_ts_inc'])
                        for k,v in state.get('bookmarks', {}).items()
                        if streams_map.get(k)])

        LOGGER.info("Starting oplog replication with ts=%s", oplog_ts)

        time_extracted = utils.now()

        rows_saved = 0
        ops_skipped = 0

        with client.local.oplog.rs.find({'ts': {'$gt': oplog_ts}},
                                        oplog_replay=True) as cursor:

            while cursor.alive:
                try:
                    row = next(cursor)

                    if row['op'] == 'n':
                        LOGGER.debug('Skipping noop op')
                    elif not streams_map.get(generate_tap_stream_id_for_row(row)):
                        ops_skipped = ops_skipped + 1

                        if ops_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                            LOGGER.info("Skipped %s ops so far as they were not for selected tables; %s rows extracted",
                                        ops_skipped,
                                        rows_saved)
                    else:
                        rows_saved += 1
                        row_op = row['op']
                        if row_op in ['i']:
                            tap_stream_id = generate_tap_stream_id_for_row(row)
                            stream_map_entry = streams_map[tap_stream_id]
                            whitelisted_row = {k:v for k,v in row['o'].items() if k not in stream_map_entry['blacklist']}
                            record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                        whitelisted_row,
                                                                        common.get_stream_version(tap_stream_id, state),
                                                                        time_extracted)

                            singer.write_message(record_message)
                        elif row_op in ['u']:
                            tap_stream_id = generate_tap_stream_id_for_row(row)
                            stream_map_entry = streams_map[tap_stream_id]

                            # if '$set' in row['o'].keys():
                            #     obj = dict(row['o2'], **row['o']['$set'])
                            # else:
                            #     obj = row['o']

                            whitelisted_row = {k:v for k,v in row['o'].items() if k not in stream_map_entry['blacklist']}
                            record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                        whitelisted_row,
                                                                        common.get_stream_version(tap_stream_id, state),
                                                                        time_extracted)

                            singer.write_message(record_message)
                        elif row_op == 'd':
                            tap_stream_id = generate_tap_stream_id_for_row(row)
                            stream_map_entry = streams_map[tap_stream_id]

                            # Delete ops only contain the _id of the row deleted
                            whitelisted_row = {}
                            whitelisted_row['_id'] = row['o']['_id']
                            whitelisted_row[SDC_DELETED_AT] = row['ts']

                            record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                        whitelisted_row,
                                                                        common.get_stream_version(tap_stream_id, state),
                                                                        time_extracted)
                            singer.write_message(record_message)
                        else:
                            LOGGER.info("Skipping op for table %s as it is not an INSERT, UPDATE, or DELETE", row['ns'])

                    state = update_bookmarks(state,
                                             streams_map,
                                             row['ts'])
                except InvalidBSON as e:
                    LOGGER.info(e)
                    continue

                if rows_saved % 1000 == 0:
                    singer.write_state(state)

            # Send state message at the end
            singer.write_state(state)
Code Example #8
def sync_table(client, stream, state, stream_version, columns):
    common.whitelist_bookmark_keys(generate_bookmark_keys(stream),
                                   stream['tap_stream_id'], state)

    mdata = metadata.to_map(stream['metadata'])
    stream_metadata = mdata.get(())

    database_name = stream_metadata['database-name']

    db = client[database_name]
    collection = db[stream['stream']]

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['stream'], version=stream_version)

    initial_full_table_complete = singer.get_bookmark(
        state, stream['tap_stream_id'], 'initial_full_table_complete')

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete:
        singer.write_message(activate_version_message)

    max_id_value = singer.get_bookmark(
        state, stream['tap_stream_id'],
        'max_id_value') or get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'max_id_value', max_id_value)

    find_filter = {'$lte': objectid.ObjectId(max_id_value)}

    if last_id_fetched:
        find_filter['$gt'] = objectid.ObjectId(last_id_fetched)

    LOGGER.info("Starting full table replication for table {}.{}".format(
        database_name, stream['stream']))

    with metrics.record_counter(None) as counter:
        with collection.find({'_id': find_filter},
                             sort=[("_id", pymongo.DESCENDING)]) as cursor:
            rows_saved = 0

            time_extracted = utils.now()

            for row in cursor:
                rows_saved += 1

                whitelisted_row = {
                    k: v
                    for k, v in row.items() if k in columns
                }
                record_message = common.row_to_singer_record(
                    stream, whitelisted_row, stream_version, time_extracted)

                singer.write_message(record_message)

                state = singer.write_bookmark(state, stream['tap_stream_id'],
                                              'last_id_fetched',
                                              str(row['_id']))

                if rows_saved % 1000 == 0:
                    singer.write_message(
                        singer.StateMessage(value=copy.deepcopy(state)))

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')

    singer.write_message(activate_version_message)
Code Example #9
def sync_oplog_stream(client, streams, state):
    streams_map = generate_streams_map(streams)

    for tap_stream_id in streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    for tap_stream_id, bookmark in state.get('bookmarks', {}).items():
        columns = streams_map.get(tap_stream_id)

        if not columns:
            continue

        oplog_ts = min([timestamp.Timestamp(v['oplog_ts_time'], v['oplog_ts_inc'])
                        for k,v in state.get('bookmarks', {}).items()
                        if streams_map.get(k)])

        LOGGER.info("Starting oplog replication with ts=%s", oplog_ts)

        time_extracted = utils.now()

        rows_saved = 0
        ops_skipped = 0

        with client.local.oplog.rs.find({'ts': {'$gt': oplog_ts}},
                                        oplog_replay=True) as cursor:
            for row in cursor:
                if row['op'] == 'n':
                    LOGGER.info('Skipping noop op')
                elif not streams_map.get(generate_tap_stream_id_for_row(row)):
                    ops_skipped = ops_skipped + 1

                    if ops_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                        LOGGER.info("Skipped %s ops so far as they were not for selected tables; %s rows extracted",
                                    ops_skipped,
                                    rows_saved)
                else:
                    row_op = row['op']
                    if row_op in ['i', 'u']:
                        tap_stream_id = generate_tap_stream_id_for_row(row)
                        stream_map_entry = streams_map[tap_stream_id]
                        whitelisted_row = {k:v for k,v in row['o'].items() if k in stream_map_entry['columns']}
                        record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                     whitelisted_row,
                                                                     common.get_stream_version(tap_stream_id, state),
                                                                     time_extracted)

                        singer.write_message(record_message)

                    elif row_op == 'd':
                        tap_stream_id = generate_tap_stream_id_for_row(row)
                        stream_map_entry = streams_map[tap_stream_id]

                        # Delete ops only contain the _id of the row deleted
                        whitelisted_row = {column_name:None for column_name in stream_map_entry['columns']}

                        whitelisted_row['_id'] = row['o']['_id']
                        whitelisted_row[SDC_DELETED_AT] = row['ts']

                        record_message = common.row_to_singer_record(stream_map_entry['stream'],
                                                                     whitelisted_row,
                                                                     common.get_stream_version(tap_stream_id, state),
                                                                     time_extracted)
                        singer.write_message(record_message)
                    else:
                        LOGGER.info("Skipping op for table %s as it is not an INSERT, UPDATE, or DELETE", row['ns'])

                state = update_bookmarks(state,
                                         streams_map,
                                         row['ts'])
Code Example #10
def sync_database(database: Database,
                  streams_to_sync: Dict[str, Dict],
                  state: Dict,
                  update_buffer_size: int,
                  await_time_ms: int
                  ) -> None:
    """
    Syncs the records from the given collection using ChangeStreams
    Args:
        database: MongoDB Database instance to sync
        streams_to_sync: Dict of stream dictionary with all the stream details
        state: state dictionary
        update_buffer_size: the size of buffer used to hold detected updates
        await_time_ms:  the maximum time in milliseconds for the log based to wait for changes before exiting
    """
    LOGGER.info('Starting LogBased sync for streams "%s" in database "%s"', list(streams_to_sync.keys()), database.name)

    rows_saved = {}
    start_time = time.time()
    update_buffer = {}

    for stream_id in streams_to_sync:
        update_buffer[stream_id] = set()
        rows_saved[stream_id] = 0

    stream_ids = set(streams_to_sync.keys())

    # Init a cursor to listen for changes from the last saved resume token
    # if there are no changes after MAX_AWAIT_TIME_MS, then we'll exit
    with database.watch(
            [{'$match': {
                '$or': [
                    {'operationType': 'insert'}, {'operationType': 'update'}, {'operationType': 'delete'}
                ],
                '$and': [
                    # watch collections of selected streams
                    {'ns.coll': {'$in': [val['table_name'] for val in streams_to_sync.values()]}}
                ]
            }}],
            max_await_time_ms=await_time_ms,
            start_after=get_token_from_state(stream_ids, state)
    ) as cursor:
        while cursor.alive:

            change = cursor.try_next()

            # Note that the ChangeStream's resume token may be updated
            # even when no changes are returned.

            # Token can look like in MongoDB 4.2:
            #       {'_data': 'A_LONG_HEX_DECIMAL_STRING'}
            #    or {'_data': 'A_LONG_HEX_DECIMAL_STRING', '_typeBits': b'SOME_HEX'}

            # Get the '_data' only from resume token
            # token can contain a property '_typeBits' of type bytes which cannot be json
            # serialized when creating the state.
            # '_data' is enough to resume LOG_BASED
            resume_token = {
                '_data': cursor.resume_token['_data']
            }

            # After MAX_AWAIT_TIME_MS has elapsed, the cursor will return None.
            # write state and exit
            if change is None:
                LOGGER.info('No change streams after %s, updating bookmark and exiting...', await_time_ms)

                state = update_bookmarks(state, stream_ids, resume_token)
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                break

            tap_stream_id = f'{change["ns"]["db"]}-{change["ns"]["coll"]}'

            operation = change['operationType']

            if operation == 'insert':
                singer.write_message(common.row_to_singer_record(stream=streams_to_sync[tap_stream_id],
                                                                 row=change['fullDocument'],
                                                                 time_extracted=utils.now(),
                                                                 time_deleted=None))

                rows_saved[tap_stream_id] += 1

            elif operation == 'update':
                # update operations only return the _id and the updated fields,
                # so save the _id for now; the full document is fetched when the buffer is flushed
                update_buffer[tap_stream_id].add(change['documentKey']['_id'])

            elif operation == 'delete':
                # remove update from buffer if that document has been deleted
                update_buffer[tap_stream_id].discard(change['documentKey']['_id'])

                # Delete ops only contain the _id of the row deleted
                singer.write_message(common.row_to_singer_record(
                    stream=streams_to_sync[tap_stream_id],
                    row={'_id': change['documentKey']['_id']},
                    time_extracted=utils.now(),
                    time_deleted=change[
                        'clusterTime'].as_datetime()))  # returns python's datetime.datetime instance in UTC

                rows_saved[tap_stream_id] += 1

            # update the states of all streams
            state = update_bookmarks(state, stream_ids, resume_token)

            # flush buffer if it has filled up or flush and write state every UPDATE_BOOKMARK_PERIOD messages
            if sum(len(stream_buffer) for stream_buffer in update_buffer.values()) >= update_buffer_size or \
                    sum(rows_saved.values()) % common.UPDATE_BOOKMARK_PERIOD == 0:

                LOGGER.debug('Flushing update buffer ...')

                flush_buffer(update_buffer, streams_to_sync, database, rows_saved)

                if sum(rows_saved.values()) % common.UPDATE_BOOKMARK_PERIOD == 0:
                    # write state
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # flush buffer if finished with changeStreams
    flush_buffer(update_buffer, streams_to_sync, database, rows_saved)

    for stream_id in stream_ids:
        common.COUNTS[stream_id] += rows_saved[stream_id]
        common.TIMES[stream_id] += time.time() - start_time
        LOGGER.info('Synced %s records for %s', rows_saved[stream_id], stream_id)
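
update_bookmarks and get_token_from_state above are helpers defined elsewhere in the module; a plausible sketch of the bookmark update, assuming the resume token is simply stored under a per-stream bookmark key (an illustration, not necessarily the project's exact code):

def update_bookmarks(state: Dict, stream_ids: Set[str], token: Dict) -> Dict:
    # store the ChangeStream resume token as a bookmark for every stream being synced
    for stream_id in stream_ids:
        state = singer.write_bookmark(state, stream_id, 'token', token)
    return state
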
Code Example #11
def sync_collection(collection: Collection, stream: Dict, state: Dict) -> None:
    """
    Sync collection records incrementally
    Args:
        collection: MongoDB collection instance
        stream: dictionary of all stream details
        state: the tap state
    """
    LOGGER.info('Starting full table sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    # pick a new table version if last run wasn't interrupted
    if singer.get_bookmark(state, stream['tap_stream_id'],
                           'last_id_fetched') is not None:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = common.string_to_class(
            singer.get_bookmark(state, stream['tap_stream_id'],
                                'max_id_value'),
            singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type'))
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        find_filter['$gte'] = common.string_to_class(
            last_id_fetched,
            singer.get_bookmark(state, stream['tap_stream_id'],
                                'last_id_fetched_type'))

    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'],
                dict(find=find_filter))

    with collection.find({'_id': find_filter},
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:
            rows_saved += 1

            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))

            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    singer.write_bookmark(state, stream['tap_stream_id'],
                          'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, stream['tap_stream_id'])