Example #1
def write_schema(schema, row, stream):
    schema_build_start_time = time.time()
    if common.row_to_schema(schema, row):
        singer.write_message(singer.SchemaMessage(
            stream=common.calculate_destination_stream_name(stream),
            schema=schema,
            key_properties=['_id']))
        common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
    common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - schema_build_start_time
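Note that write_schema mutates the passed-in schema via common.row_to_schema and only emits a SCHEMA message when the row actually changed it. A hypothetical call site inside a row loop (the surrounding names are assumed) might look like:

schema = stream['schema'] or {'type': 'object', 'properties': {}}
for row in cursor:
    write_schema(schema, row, stream)  # SCHEMA message only when the schema changed
    # ... then emit the RECORD message for this row ...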
Example #2
def write_schema_message(stream: Dict):
    """
    Creates and writes a stream schema message to stdout
    Args:
        stream: stream catalog
    """
    write_message(SchemaMessage(
        stream=calculate_destination_stream_name(stream),
        schema=stream['schema'],
        key_properties=['_id']))
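A hypothetical minimal catalog entry for this function, inferred from the tests below; only the 'schema', 'stream', and 'metadata' keys are actually read:

stream = {
    'stream': 'myStream',
    'schema': {'type': 'object', 'properties': {}},
    'metadata': [{
        'breadcrumb': [],
        'metadata': {'database-name': 'myDb'},
    }],
}
write_schema_message(stream)  # emits a SCHEMA message to stdout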
Example #3
    def test_calculate_destination_stream_name_with_include_schema_False(self):
        """

        """
        stream = {
            'stream':
            'myStream',
            'metadata': [{
                "breadcrumb": [],
                "metadata": {
                    "database-name": "myDb",
                }
            }]
        }
        common.INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME = False
        self.assertEqual('myStream',
                         common.calculate_destination_stream_name(stream))
Example #4
    def test_calculate_destination_stream_name_with_include_schema_True(self):
        """

        """
        stream = {
            'stream':
            'myStream',
            'metadata': [{
                "breadcrumb": [],
                "metadata": {
                    "database-name": "myDb",
                }
            }]
        }
        with patch(
                'tap_mongodb.common.INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME'
        ) as constant_mock:
            constant_mock.return_value = True
            self.assertEqual('myDb-myStream',
                             common.calculate_destination_stream_name(stream))
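The two tests above pin down the observable behavior of calculate_destination_stream_name; a sketch consistent with them (not necessarily the real implementation in tap_mongodb's common module):

from singer import metadata


def calculate_destination_stream_name(stream):
    # prefix the stream name with its database name only when the
    # module-level INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME flag is truthy
    if INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME:
        md_map = metadata.to_map(stream['metadata'])
        return '{}-{}'.format(metadata.get(md_map, (), 'database-name'),
                              stream['stream'])
    return stream['stream']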
Example #5
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    # pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched,
                                                     last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(
                            stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += (
                time.time() - schema_build_start_time)

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)

            singer.write_message(record_message)

            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
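get_max_id_value is called above but not shown; a plausible sketch, assuming it returns the largest _id in the collection, or None when the collection is empty:

import pymongo


def get_max_id_value(collection):
    # fetch only the _id of the document with the highest _id
    row = collection.find_one(projection=['_id'],
                              sort=[('_id', pymongo.DESCENDING)])
    return row['_id'] if row else None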
Example #6
def write_schema_message(stream):
    singer.write_message(
        singer.SchemaMessage(
            stream=common.calculate_destination_stream_name(stream),
            schema=stream['schema'],
            key_properties=['_id']))
Example #7
def sync_collection(
    collection: Collection,
    stream: Dict,
    state: Optional[Dict],
) -> None:
    """
    Syncs the stream records incrementally
    Args:
        collection: MongoDB collection instance
        stream: stream dictionary
        state: state dictionary if exists
    """
    LOGGER.info('Starting incremental sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # pick a new table version if last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'], {})

    replication_key_name = metadata.to_map(stream['metadata']).get(
        ()).get('replication-key')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}

    if stream_state.get('replication_key_value'):
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gt'] = common.string_to_class(
            stream_state.get('replication_key_value'),
            stream_state.get('replication_key_type'))

    # log query
    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'],
                dict(find=find_filter))

    with collection.find(find_filter,
                         sort=[(replication_key_name,
                                pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:

            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))
            rows_saved += 1

            update_bookmark(row, state, stream['tap_stream_id'],
                            replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, stream['tap_stream_id'])
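update_bookmark is referenced above but not shown; a sketch consistent with the replication_key_value / replication_key_type bookmarks read when building find_filter, assuming the tap's common.class_to_string helper:

import singer


def update_bookmark(row, state, tap_stream_id, replication_key_name):
    # store the replication-key value as a string together with its type
    # name, so string_to_class can rebuild it on the next run
    value = row[replication_key_name]
    singer.write_bookmark(state, tap_stream_id, 'replication_key_value',
                          common.class_to_string(value,
                                                 value.__class__.__name__))
    singer.write_bookmark(state, tap_stream_id, 'replication_key_type',
                          value.__class__.__name__)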
Example #8
def sync_collection(client, stream, state, stream_projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting oplog sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')
    collection_name = stream.get("table_name")
    stream_state = state.get('bookmarks', {}).get(tap_stream_id)

    oplog_ts = timestamp.Timestamp(stream_state['oplog_ts_time'],
                                   stream_state['oplog_ts_inc'])

    # Write activate version message
    version = common.get_stream_version(tap_stream_id, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=version)
    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    start_time = time.time()

    oplog_query = {'ts': {'$gte': oplog_ts}}

    projection = transform_projection(stream_projection)

    oplog_replay = stream_projection is None

    LOGGER.info(
        'Querying %s with:\n\tFind Parameters: %s\n\tProjection: %s\n\toplog_replay: %s',
        tap_stream_id, oplog_query, projection, oplog_replay)

    update_buffer = set()

    # consider adding oplog_replay, but this would require removing the projection.
    # default behavior is a non_tailable cursor, but we might want a tailable one
    # regardless of whether it's long-lived or not.
    with client.local.oplog.rs.find(oplog_query,
                                    projection,
                                    sort=[('$natural', pymongo.ASCENDING)],
                                    oplog_replay=oplog_replay) as cursor:
        for row in cursor:
            # assert that Mongo is respecting the ts query and sort order
            if row.get('ts') and row.get('ts') < oplog_ts:
                raise common.MongoAssertionException(
                    "Mongo is not honoring the query param")
            if row.get('ts') and row.get('ts') < timestamp.Timestamp(
                    stream_state['oplog_ts_time'],
                    stream_state['oplog_ts_inc']):
                raise common.MongoAssertionException(
                    "Mongo is not honoring the sort ascending param")

            if row.get('ns') != '{}.{}'.format(database_name, collection_name):
                if row.get('ts'):
                    state = update_bookmarks(state, tap_stream_id, row['ts'])
                continue

            row_op = row['op']
            if row_op == 'i':

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            elif row_op == 'u':
                update_buffer.add(row['o2']['_id'])

            elif row_op == 'd':

                # remove update from buffer if that document has been deleted
                if row['o']['_id'] in update_buffer:
                    update_buffer.remove(row['o']['_id'])

                # Delete ops only contain the _id of the row deleted
                row['o'][SDC_DELETED_AT] = row['ts']

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            state = update_bookmarks(state, tap_stream_id, row['ts'])

            # flush buffer if it has filled up
            if len(update_buffer) >= MAX_UPDATE_BUFFER_LENGTH:
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

            # write state every UPDATE_BOOKMARK_PERIOD messages
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                # flush buffer before writing state
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

                # write state
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        # flush buffer if finished with oplog
        for buffered_row in flush_buffer(client, update_buffer,
                                         stream_projection, database_name,
                                         collection_name):
            record_message = common.row_to_singer_record(
                stream, buffered_row, version, time_extracted)

            singer.write_message(record_message)
            rows_saved += 1

    common.COUNTS[tap_stream_id] += rows_saved
    common.TIMES[tap_stream_id] += time.time() - start_time
    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
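update_bookmarks is referenced above but not shown; a plausible sketch, assuming it persists the two components of the BSON timestamp that are read back into timestamp.Timestamp at the top of the function:

import singer


def update_bookmarks(state, tap_stream_id, ts):
    # ts is a bson.timestamp.Timestamp; store its time/inc pair so the
    # next run can resume the oplog scan from this point
    state = singer.write_bookmark(state, tap_stream_id, 'oplog_ts_time',
                                  ts.time)
    state = singer.write_bookmark(state, tap_stream_id, 'oplog_ts_inc',
                                  ts.inc)
    return state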
Example #9
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)

    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version if last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )


    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})

    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))

    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)


    # query collection
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        for row in cursor:
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time


            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)

            # gen_schema = common.row_to_schema_message(schema, record_message.record, row)
            # if DeepDiff(schema, gen_schema, ignore_order=True) != {}:
            #   emit gen_schema
            #   schema = gen_schema
            singer.write_message(record_message)
            rows_saved += 1

            update_bookmark(row, state, tap_stream_id, replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))


        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time()-start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
Example #10
def sync_collection(collection: Collection, stream: Dict, state: Dict) -> None:
    """
    Syncs the collection's records with a full table strategy
    Args:
        collection: MongoDB collection instance
        stream: dictionary of all stream details
        state: the tap state
    """
    LOGGER.info('Starting full table sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    # pick a new table version if last run wasn't interrupted
    if singer.get_bookmark(state, stream['tap_stream_id'],
                           'last_id_fetched') is not None:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = common.string_to_class(
            singer.get_bookmark(state, stream['tap_stream_id'],
                                'max_id_value'),
            singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type'))
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        find_filter['$gte'] = common.string_to_class(
            last_id_fetched,
            singer.get_bookmark(state, stream['tap_stream_id'],
                                'last_id_fetched_type'))

    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'],
                dict(find=find_filter))

    with collection.find({'_id': find_filter},
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:
            rows_saved += 1

            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))

            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    singer.write_bookmark(state, stream['tap_stream_id'],
                          'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, stream['tap_stream_id'])
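Several examples round-trip bookmark values through common.class_to_string / common.string_to_class; a simplified sketch covering only the ObjectId and datetime cases (the real helpers handle more BSON types):

from bson import objectid
from singer import utils


def class_to_string(value, class_name):
    # serialize a bookmark value into a plain string for the JSON state
    if class_name == 'ObjectId':
        return str(value)
    if class_name == 'datetime':
        return utils.strftime(value)
    raise TypeError('Unsupported type: {}'.format(class_name))


def string_to_class(value, class_name):
    # inverse of class_to_string
    if class_name == 'ObjectId':
        return objectid.ObjectId(value)
    if class_name == 'datetime':
        return utils.strptime_with_tz(value)
    raise TypeError('Unsupported type: {}'.format(class_name))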