def write_schema(schema, row, stream):
    """Evolve ``schema`` in place from ``row`` and emit a SCHEMA message on change.

    Also updates the module-level schema counters and timers for the stream.
    NOTE(review): reconstructed from a single collapsed source line — the
    SCHEMA_TIMES accumulation is placed outside the ``if`` to match the inline
    copies of this logic elsewhere in the file; confirm against the original.
    """
    build_started = time.time()
    schema_changed = common.row_to_schema(schema, row)
    if schema_changed:
        message = singer.SchemaMessage(
            stream=common.calculate_destination_stream_name(stream),
            schema=schema,
            key_properties=['_id'])
        singer.write_message(message)
        common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
    common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - build_started
def write_schema_message(stream: Dict):
    """
    Builds the stream schema message and writes it to stdout

    Args:
        stream: stream catalog entry carrying the JSON schema
    """
    schema_message = SchemaMessage(
        stream=calculate_destination_stream_name(stream),
        schema=stream['schema'],
        key_properties=['_id'],
    )
    write_message(schema_message)
def test_calculate_destination_stream_name_with_include_schema_False(self):
    """Stream name must NOT be db-prefixed when the include-schema flag is off."""
    stream = {
        'stream': 'myStream',
        'metadata': [{
            "breadcrumb": [],
            "metadata": {
                "database-name": "myDb",
            }
        }]
    }
    # FIX: the original assigned common.INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME
    # directly, permanently mutating module state and leaking into other tests.
    # Patch with an explicit value so the flag is restored on exit.
    with patch('tap_mongodb.common.INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME', False):
        self.assertEqual('myStream',
                         common.calculate_destination_stream_name(stream))
def test_calculate_destination_stream_name_with_include_schema_True(self):
    """Stream name must be '<db>-<stream>' when the include-schema flag is on."""
    stream = {
        'stream': 'myStream',
        'metadata': [{
            "breadcrumb": [],
            "metadata": {
                "database-name": "myDb",
            }
        }]
    }
    # BUG FIX: patch() without an explicit `new` replaces the module constant
    # with a MagicMock; setting `constant_mock.return_value = True` never takes
    # effect because the constant is read, not called — the test only passed
    # because a MagicMock happens to be truthy. Patch with the literal value.
    with patch('tap_mongodb.common.INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME', True):
        self.assertEqual('myDb-myStream',
                         common.calculate_destination_stream_name(stream))
def sync_collection(client, stream, state, projection):
    """Full-table sync of one MongoDB collection, resumable via `_id` bookmarks.

    Args:
        client: MongoClient; the target db/collection are resolved from metadata.
        stream: catalog stream dict ('tap_stream_id', 'stream', 'metadata', 'schema').
        state: singer state dict; version/max_id/last_id bookmarks are read and written.
        projection: optional Mongo projection passed through to ``find``.
    """
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)
    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')
    db = client[database_name]
    collection = db[stream['stream']]
    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None
    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state,
                                          stream['tap_stream_id'],
                                          'last_id_fetched') is not None
    # pick a new table version only if the last run wasn't interrupted;
    # an interrupted run keeps its version so resumed records replace cleanly
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    else:
        stream_version = int(time.time() * 1000)
    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)
    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)
    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark: restore the upper-bound _id from its string form
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)
    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    if max_id_value:
        # Write the bookmark if max_id_value is defined (stored as string + type name)
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value, max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'max_id_type',
                                      max_id_value.__class__.__name__)
    # bounded scan: everything up to the max _id captured at sync start,
    # resuming from the last _id fetched when the previous run was interrupted
    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state, stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched, last_id_fetched_type)
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)
    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()
        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1
            # evolve the schema from this row; emit a fresh SCHEMA message on change
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - schema_build_start_time
            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)
            singer.write_message(record_message)
            # checkpoint the last _id emitted so an interrupted run can resume
            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'], row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time
    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type')
    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'initial_full_table_complete', True)
    # final ACTIVATE_VERSION lets the target retire rows from older versions
    singer.write_message(activate_version_message)
    LOGGER.info('Syncd {} records for {}'.format(rows_saved, tap_stream_id))
def write_schema_message(stream):
    """Emit the SCHEMA message for the given catalog stream to stdout."""
    message = singer.SchemaMessage(
        stream=common.calculate_destination_stream_name(stream),
        schema=stream['schema'],
        key_properties=['_id'],
    )
    singer.write_message(message)
def sync_collection(
        collection: Collection,
        stream: Dict,
        state: Optional[Dict],
) -> None:
    """
    Syncs the stream records incrementally using a replication-key bookmark

    Args:
        collection: MongoDB collection instance
        stream: stream dictionary (must carry 'tap_stream_id' and 'metadata')
        state: state dictionary if exists
               NOTE(review): annotated Optional, but a None state would crash
               on ``state.get`` below — callers appear to always pass a dict.
    """
    LOGGER.info('Starting incremental sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version on the first run, otherwise keep the stored one
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'], {})
    # BUG FIX: .get(()) returns None when the empty-breadcrumb metadata entry
    # is missing, raising AttributeError on the chained lookup; default to {}
    # so replication_key_name cleanly comes back as None instead.
    replication_key_name = metadata.to_map(stream['metadata']).get(
        (), {}).get('replication-key')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query: strictly greater than the bookmarked replication-key value
    find_filter = {}
    if stream_state.get('replication_key_value'):
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gt'] = common.string_to_class(
            stream_state.get('replication_key_value'),
            stream_state.get('replication_key_type'))

    # log query
    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'], dict(find=find_filter))

    with collection.find(find_filter,
                         sort=[(replication_key_name, pymongo.ASCENDING)
                               ]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:
            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))
            rows_saved += 1

            # advance the replication-key bookmark from this row
            update_bookmark(row, state, stream['tap_stream_id'], replication_key_name)

            # flush state periodically so progress survives interruption
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Syncd %s records for %s', rows_saved, stream['tap_stream_id'])
def sync_collection(client, stream, state, stream_projection):
    """Log-based (oplog) sync for one collection starting from the bookmarked ts.

    Args:
        client: MongoClient with access to ``local.oplog.rs``.
        stream: catalog stream dict ('tap_stream_id', 'metadata', 'table_name').
        state: singer state; requires 'oplog_ts_time'/'oplog_ts_inc' bookmarks.
        stream_projection: optional projection; when None, oplog_replay is enabled.

    Update ops are buffered by _id and re-fetched in batches via flush_buffer
    so only the current document version is emitted.
    """
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting oplog sync for %s', tap_stream_id)
    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')
    collection_name = stream.get("table_name")
    stream_state = state.get('bookmarks', {}).get(tap_stream_id)
    # resume point: BSON timestamp rebuilt from the two bookmark components
    oplog_ts = timestamp.Timestamp(stream_state['oplog_ts_time'],
                                   stream_state['oplog_ts_inc'])
    # Write activate version message
    version = common.get_stream_version(tap_stream_id, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=version)
    singer.write_message(activate_version_message)
    time_extracted = utils.now()
    rows_saved = 0
    start_time = time.time()
    oplog_query = {'ts': {'$gte': oplog_ts}}
    projection = transform_projection(stream_projection)
    # oplog_replay is only valid without a projection
    oplog_replay = stream_projection is None
    LOGGER.info(
        'Querying %s with:\n\tFind Parameters: %s\n\tProjection: %s\n\toplog_replay: %s',
        tap_stream_id, oplog_query, projection, oplog_replay)
    # _ids of updated docs awaiting a re-fetch (deduplicates repeated updates)
    update_buffer = set()
    # consider adding oplog_replay, but this would require removing the projection
    # default behavior is a non_tailable cursor but we might want a tailable one
    # regardless of whether its long lived or not.
    with client.local.oplog.rs.find(oplog_query,
                                    projection,
                                    sort=[('$natural', pymongo.ASCENDING)],
                                    oplog_replay=oplog_replay) as cursor:
        for row in cursor:
            # assertions that mongo is respecing the ts query and sort order
            if row.get('ts') and row.get('ts') < oplog_ts:
                raise common.MongoAssertionException(
                    "Mongo is not honoring the query param")
            if row.get('ts') and row.get('ts') < timestamp.Timestamp(
                    stream_state['oplog_ts_time'], stream_state['oplog_ts_inc']):
                raise common.MongoAssertionException(
                    "Mongo is not honoring the sort ascending param")
            # entries for other namespaces still advance the ts bookmark
            if row.get('ns') != '{}.{}'.format(database_name, collection_name):
                if row.get('ts'):
                    state = update_bookmarks(state, tap_stream_id, row['ts'])
                continue
            row_op = row['op']
            if row_op == 'i':
                # insert: the full document is in 'o'
                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)
                rows_saved += 1
            elif row_op == 'u':
                # update: buffer the _id for a later batched re-fetch
                update_buffer.add(row['o2']['_id'])
            elif row_op == 'd':
                # remove update from buffer if that document has been deleted
                if row['o']['_id'] in update_buffer:
                    update_buffer.remove(row['o']['_id'])
                # Delete ops only contain the _id of the row deleted
                row['o'][SDC_DELETED_AT] = row['ts']
                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)
                rows_saved += 1
            state = update_bookmarks(state, tap_stream_id, row['ts'])
            # flush buffer if it has filled up
            if len(update_buffer) >= MAX_UPDATE_BUFFER_LENGTH:
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection, database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)
                    rows_saved += 1
                update_buffer = set()
            # write state every UPDATE_BOOKMARK_PERIOD messages
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                # flush buffer before writing state so the bookmark never
                # outruns records still waiting in the buffer
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection, database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)
                    rows_saved += 1
                update_buffer = set()
                # write state
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))
        # flush buffer if finished with oplog
        for buffered_row in flush_buffer(client, update_buffer,
                                         stream_projection, database_name,
                                         collection_name):
            record_message = common.row_to_singer_record(
                stream, buffered_row, version, time_extracted)
            singer.write_message(record_message)
            rows_saved += 1
        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time
    LOGGER.info('Syncd %s records for %s', rows_saved, tap_stream_id)
def sync_collection(client, stream, state, projection):
    """Incremental sync driven by a replication-key bookmark (legacy variant).

    Args:
        client: MongoClient; db/collection resolved from stream metadata.
        stream: catalog stream dict ('tap_stream_id', 'stream', 'metadata').
        state: singer state dict; version and replication-key bookmarks used.
        projection: optional Mongo projection passed through to ``find``.

    Note: uses an inclusive ``$gte`` on the bookmark, so the last previously
    emitted row is re-emitted on resume.
    """
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)
    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]
    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None
    # pick a new table version if last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )
    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)
    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})
    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')
    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    # create query: resume at (inclusive) the bookmarked replication-key value
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))
    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id,
                                                                      find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)
    # query collection; schema is (re)built from scratch as rows are observed
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()
        for row in cursor:
            # evolve schema from this row; emit SCHEMA message when it changed
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time
            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)
            singer.write_message(record_message)
            rows_saved += 1
            # advance the replication-key bookmark from this row
            update_bookmark(row, state, tap_stream_id, replication_key_name)
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time()-start_time
    singer.write_message(activate_version_message)
    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
def sync_collection(collection: Collection, stream: Dict, state: Dict) -> None:
    """
    Sync collection records with a full table scan, resumable via _id bookmarks

    Args:
        collection: MongoDB collection instance
        stream: dictionary of all stream details
        state: the tap state
    """
    LOGGER.info('Starting full table sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark;
    # keep that run's version so resumed records replace cleanly,
    # otherwise pick a new table version
    if singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched') is not None:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark: restore the upper-bound _id from its string form
        max_id_value = common.string_to_class(
            singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'),
            singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type'))
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined (stored as string + type name)
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value, max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'max_id_type',
                                      max_id_value.__class__.__name__)

    # bounded scan: everything up to the max _id captured at sync start,
    # resuming from the last _id fetched when the previous run was interrupted
    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        find_filter['$gte'] = common.string_to_class(
            last_id_fetched,
            singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type'))

    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'], dict(find=find_filter))

    with collection.find({'_id': find_filter},
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:
            rows_saved += 1
            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))
            # checkpoint the last _id emitted so an interrupted run can resume
            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'], row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type')

    # NOTE(review): return value of write_bookmark is discarded here; the
    # surrounding code relies on it mutating `state` in place — confirm.
    singer.write_bookmark(state, stream['tap_stream_id'],
                          'initial_full_table_complete', True)

    # final ACTIVATE_VERSION lets the target retire rows from older versions
    singer.write_message(activate_version_message)

    LOGGER.info('Syncd %s records for %s', rows_saved, stream['tap_stream_id'])