def test_kinesis_consumer_at_timestamp(sdc_builder, sdc_executor, aws):
    """Test for Kinesis consumer origin stage, with AT_TIMESTAMP option.

    We do so by:
        1. Publishing data to a test stream
        2. Waiting some time and storing the current timestamp
        3. Publishing new data
        4. Running a Kinesis Consumer pipeline whose initial position is AT_TIMESTAMP, using the stored
           timestamp
        5. Asserting that only the newest data has been read

    The pipeline looks like:

    Kinesis Consumer pipeline:
        kinesis_consumer >> trash
    """
    # build stream
    application_name = get_random_string()
    stream_name = f'{aws.kinesis_stream_prefix}_{get_random_string()}'

    client = aws.kinesis
    # Bound only after the pipeline has been added to SDC, so that the cleanup in `finally` knows
    # whether there is a pipeline that may still be running.
    consumer_origin_pipeline = None
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')

        # 1. Publish data to the stream
        put_records = [{'Data': f'First Message {i}', 'PartitionKey': '111'} for i in range(10)]
        client.put_records(Records=put_records, StreamName=stream_name)

        # 2. Wait and store timestamp (whole epoch seconds scaled by 1000)
        time.sleep(10)
        timestamp = int(time.time())*1000

        # 3. Publish new data
        put_records = [{'Data': f'Second Message {i}', 'PartitionKey': '111'} for i in range(10)]
        client.put_records(Records=put_records, StreamName=stream_name)

        # 4. Build consumer pipeline using timestamp
        builder = sdc_builder.get_pipeline_builder()
        builder.add_error_stage('Discard')
        kinesis_consumer = builder.add_stage('Kinesis Consumer')
        kinesis_consumer.set_attributes(application_name=application_name, data_format='TEXT',
                                        initial_position='AT_TIMESTAMP',
                                        initial_timestamp=timestamp,
                                        stream_name=stream_name)
        trash = builder.add_stage('Trash')
        kinesis_consumer >> trash

        built_pipeline = builder.build(title='Kinesis Consumer pipeline').configure_for_environment(aws)
        sdc_executor.add_pipeline(built_pipeline)
        consumer_origin_pipeline = built_pipeline

        # 5. messages are published, read through the pipeline and assert
        snapshot = sdc_executor.capture_snapshot(consumer_origin_pipeline, start_pipeline=True, batches=1).snapshot
        sdc_executor.stop_pipeline(consumer_origin_pipeline)

        output_records = [record.field for record in snapshot[kinesis_consumer.instance_name].output]

        # all() is vacuously true for an empty list, so first require that something was read.
        assert output_records, 'No records were captured from the Kinesis Consumer'
        assert all('Second' in str(output_record) for output_record in output_records)
    finally:
        # If the snapshot capture failed, the pipeline may still be running; stop it before removing the
        # resources it reads from.
        if consumer_origin_pipeline is not None:
            _ensure_pipeline_is_stopped(sdc_executor, consumer_origin_pipeline)
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        client.delete_stream(StreamName=stream_name)  # Stream operations are done. Delete the stream.
        logger.info('Deleting %s DynamoDB table on AWS ...', application_name)
        aws.dynamodb.delete_table(TableName=application_name)
def test_kinesis_producer(sdc_builder, sdc_executor, aws):
    """Test for Kinesis producer target stage.

    We publish data to a test stream using the Kinesis producer stage, stop the pipeline and then read
    the data back from that stream using the Kinesis client. We assert that the data returned by the
    client matches what has been ingested by the producer pipeline.

    The pipeline looks like:

    Kinesis Producer pipeline:
        dev_raw_data_source >> kinesis_producer
    """
    stream_name = '{}_{}'.format(aws.kinesis_stream_prefix, get_random_string(string.ascii_letters, 10))
    raw_str = 'Hello World!'

    # Build and register the pipeline before entering the try block, so that the cleanup in `finally`
    # never refers to an unbound name (which would also skip the stream deletion) when the stream
    # cannot be created or does not become ACTIVE.
    producer_dest_pipeline = get_kinesis_producer_pipeline(sdc_builder, aws, stream_name, raw_str)
    sdc_executor.add_pipeline(producer_dest_pipeline)

    # Create Kinesis stream and capture the ShardId
    client = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')
        desc_response = client.describe_stream(StreamName=stream_name)
        shard_id = desc_response['StreamDescription']['Shards'][0]['ShardId']

        # run the pipeline for some batches and capture how many records it emitted
        sdc_executor.start_pipeline(producer_dest_pipeline).wait_for_pipeline_batch_count(10)
        sdc_executor.stop_pipeline(producer_dest_pipeline)

        history = sdc_executor.get_pipeline_history(producer_dest_pipeline)
        msgs_sent_count = history.latest.metrics.counter('pipeline.batchOutputRecords.counter').count
        logger.debug('Number of messages ingested into the pipeline = %s', msgs_sent_count)

        # read data from Kinesis to assert it is what got ingested into the pipeline
        shard_iterator = client.get_shard_iterator(StreamName=stream_name,
                                                   ShardId=shard_id,
                                                   ShardIteratorType='TRIM_HORIZON')
        response = client.get_records(ShardIterator=shard_iterator['ShardIterator'])

        # Slice rather than index by position: if fewer records than expected come back, the assertion
        # below reports the mismatch instead of an IndexError being raised here.
        msgs_received = [rec['Data'].decode().strip() for rec in response['Records'][:msgs_sent_count]]

        logger.debug('Number of messages received from Kinesis = %d', (len(msgs_received)))

        assert msgs_received == [raw_str] * msgs_sent_count
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, producer_dest_pipeline)
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        client.delete_stream(StreamName=stream_name)
def test_multiple_batches(sdc_builder, sdc_executor, aws):
    """Check that the Kinesis Consumer origin reads a stream over more than one batch.

    100 messages are published to a freshly created stream with the Kinesis client, while the origin is
    limited to 50 messages per batch. The text of the records seen by the wiretap is then compared, as a
    set, with the messages that were published.

    The pipeline looks like:

    Kinesis Consumer pipeline:
        kinesis_consumer >> wiretap
    """
    # Names for the consumer application and for the stream under test
    application_name = get_random_string(string.ascii_letters, 10)
    stream_name = f'{aws.kinesis_stream_prefix}_{get_random_string(string.ascii_lowercase)}'

    # Assemble the consumer pipeline
    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Discard')

    kinesis_consumer = pipeline_builder.add_stage('Kinesis Consumer')
    kinesis_consumer.set_attributes(
        application_name=application_name,
        data_format='TEXT',
        initial_position='TRIM_HORIZON',
        stream_name=stream_name,
        max_batch_size_in_messages=50,
    )

    wiretap = pipeline_builder.add_wiretap()
    kinesis_consumer >> wiretap.destination

    unconfigured_pipeline = pipeline_builder.build(title='Kinesis Consumer pipeline')
    consumer_origin_pipeline = unconfigured_pipeline.configure_for_environment(aws)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    kinesis = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        kinesis.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')

        expected_messages = {'Message {0}'.format(i) for i in range(100)}
        # Every record shares one arbitrary partition key; partitioning is not under test here.
        records_to_publish = [{'Data': message, 'PartitionKey': '111'} for message in expected_messages]
        kinesis.put_records(Records=records_to_publish, StreamName=stream_name)

        # Let the pipeline consume what was published, then stop it and compare
        start_command = sdc_executor.start_pipeline(consumer_origin_pipeline)
        start_command.wait_for_pipeline_output_records_count(102)
        sdc_executor.stop_pipeline(consumer_origin_pipeline)

        received_messages = {record.field['text'].value for record in wiretap.output_records}
        assert received_messages == expected_messages
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, consumer_origin_pipeline)
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        # Nothing else uses the stream from here on, so it can be removed.
        kinesis.delete_stream(StreamName=stream_name)
        logger.info('Deleting %s DynamoDB table on AWS ...', application_name)
        aws.dynamodb.delete_table(TableName=application_name)
def test_kinesis_write_to_error(sdc_builder, sdc_executor, aws):
    """Check that error records are written to a Kinesis stream.

    A Dev Raw Data Source feeds a To Error stage, and the pipeline's error stage is 'Write to Kinesis'.
    After one batch the pipeline is stopped and the stream is read with the Kinesis client. The number
    of Kinesis records must equal the pipeline's error record counter, and each record must contain
    both the To Error stage instance name and the raw input string.

    Pipeline:
        dev_raw_data_source >> error_target
    """
    stream_name = f'{aws.kinesis_stream_prefix}_{get_random_string(string.ascii_letters, 10)}'
    raw_str = 'Hello World!'

    # Assemble the pipeline
    pipeline_builder = sdc_builder.get_pipeline_builder()

    kinesis_error_stage = pipeline_builder.add_error_stage('Write to Kinesis')
    kinesis_error_stage.set_attributes(stream_name=stream_name)

    origin = pipeline_builder.add_stage('Dev Raw Data Source')
    origin.set_attributes(data_format='TEXT', raw_data=raw_str)
    error_target = pipeline_builder.add_stage('To Error')
    origin >> error_target

    pipeline = pipeline_builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(pipeline)

    try:
        # The stream must exist and be ACTIVE before the pipeline writes to it.
        logger.debug('Creating %s Kinesis stream on AWS...', stream_name)
        aws.kinesis.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')
        stream_description = aws.kinesis.describe_stream(StreamName=stream_name)['StreamDescription']
        shard_id = stream_description['Shards'][0]['ShardId']

        # Run a single batch and read the error record counter from the pipeline history.
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)
        sdc_executor.stop_pipeline(pipeline)
        latest_run = sdc_executor.get_pipeline_history(pipeline).latest
        msg_count = latest_run.metrics.counter('pipeline.batchErrorRecords.counter').count
        logger.debug('Number of records sent to error = %s.', msg_count)

        # Read the stream from its beginning and compare against the error records: same number of
        # items, and every item mentions both the stage name and the input string.
        iterator_response = aws.kinesis.get_shard_iterator(StreamName=stream_name,
                                                           ShardId=shard_id,
                                                           ShardIteratorType='TRIM_HORIZON')
        records_response = aws.kinesis.get_records(ShardIterator=iterator_response['ShardIterator'])
        kinesis_records = records_response['Records']

        assert len(kinesis_records) == msg_count
        assert all(error_target.instance_name.encode() in entry['Data'] for entry in kinesis_records)
        assert all(raw_str.encode() in entry['Data'] for entry in kinesis_records)
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, pipeline)
        logger.debug('Deleting Kinesis stream %s...', stream_name)
        aws.kinesis.delete_stream(StreamName=stream_name)
def test_kinesis_consumer_other_region(sdc_builder, sdc_executor, aws):
    """Check the Kinesis Consumer origin when the region is 'OTHER' and an explicit endpoint is given.

    Ten messages are published to a freshly created stream with the Kinesis client. The origin has its
    region set to OTHER and its endpoint set to the Kinesis service endpoint built from
    SERVICE_ENDPOINT_FORMAT and the region of the test environment. The text of the records seen by the
    wiretap is then compared, as a set, with the messages that were published.

    The pipeline looks like:

    Kinesis Consumer pipeline:
        kinesis_consumer >> wiretap
    """
    endpoint = SERVICE_ENDPOINT_FORMAT.format('kinesis', aws.region)

    # Names for the consumer application and for the stream under test
    application_name = get_random_string(string.ascii_letters, 10)
    stream_suffix = get_random_string(string.ascii_letters, 10)
    stream_name = '{}_{}'.format(aws.kinesis_stream_prefix, stream_suffix)

    # Assemble the consumer pipeline
    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Discard')

    kinesis_consumer = pipeline_builder.add_stage('Kinesis Consumer')
    kinesis_consumer.set_attributes(
        application_name=application_name,
        data_format='TEXT',
        initial_position='TRIM_HORIZON',
        stream_name=stream_name,
    )

    wiretap = pipeline_builder.add_wiretap()
    kinesis_consumer >> wiretap.destination

    consumer_origin_pipeline = pipeline_builder.build().configure_for_environment(aws)
    # Region and endpoint are set only after the pipeline has been configured for the environment.
    kinesis_consumer.set_attributes(region='OTHER', endpoint=endpoint)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    kinesis = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        kinesis.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')

        expected_messages = {'Message {0}'.format(i) for i in range(10)}
        # Every record shares one arbitrary partition key; partitioning is not under test here.
        records_to_publish = [{'Data': message, 'PartitionKey': '111'} for message in expected_messages]
        kinesis.put_records(Records=records_to_publish, StreamName=stream_name)

        # Let the pipeline consume what was published, then stop it and compare
        start_command = sdc_executor.start_pipeline(consumer_origin_pipeline)
        start_command.wait_for_pipeline_output_records_count(11)
        sdc_executor.stop_pipeline(consumer_origin_pipeline)

        received_messages = {record.field['text'].value for record in wiretap.output_records}
        assert received_messages == expected_messages
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, consumer_origin_pipeline)
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        # Nothing else uses the stream from here on, so it can be removed.
        kinesis.delete_stream(StreamName=stream_name)
        logger.info('Deleting %s DynamoDB table on AWS ...', application_name)
        aws.dynamodb.delete_table(TableName=application_name)
def test_kinesis_consumer(sdc_builder, sdc_executor, aws):
    """Test for Kinesis consumer origin stage.

    We do so by publishing data to a test stream using the Kinesis client and having a pipeline which
    reads that data using the Kinesis consumer origin stage. Data is then asserted for what is published
    by the Kinesis client and what we read in the pipeline snapshot.

    The pipeline looks like:

    Kinesis Consumer pipeline:
        kinesis_consumer >> trash
    """
    # build consumer pipeline
    application_name = get_random_string(string.ascii_letters, 10)
    stream_name = '{}_{}'.format(aws.kinesis_stream_prefix, get_random_string(string.ascii_letters, 10))

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    kinesis_consumer = builder.add_stage('Kinesis Consumer')
    kinesis_consumer.set_attributes(application_name=application_name, data_format='TEXT',
                                    initial_position='TRIM_HORIZON',
                                    stream_name=stream_name)

    trash = builder.add_stage('Trash')

    kinesis_consumer >> trash

    consumer_origin_pipeline = builder.build(title='Kinesis Consumer pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    # run pipeline and capture snapshot
    client = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')

        expected_messages = set('Message {0}'.format(i) for i in range(10))
        # not using PartitionKey logic and hence assign some temp key
        put_records = [{'Data': exp_msg, 'PartitionKey': '111'} for exp_msg in expected_messages]
        client.put_records(Records=put_records, StreamName=stream_name)

        # messages are published, read through the pipeline and assert
        snapshot = sdc_executor.capture_snapshot(consumer_origin_pipeline, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(consumer_origin_pipeline)

        output_records = [record.field['text'].value
                          for record in snapshot[kinesis_consumer.instance_name].output]

        assert set(output_records) == expected_messages
    finally:
        # If the snapshot capture failed, the pipeline may still be running; stop it before removing the
        # resources it reads from, as the other consumer tests in this file do.
        _ensure_pipeline_is_stopped(sdc_executor, consumer_origin_pipeline)
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        client.delete_stream(StreamName=stream_name)  # Stream operations are done. Delete the stream.
        logger.info('Deleting %s DynamoDB table on AWS ...', application_name)
        aws.dynamodb.delete_table(TableName=application_name)
def test_stream_name(sdc_builder, sdc_executor, aws, test_name, stream_generator, keep_data):
    """Test for Kinesis producer destination stage.

    We do so by generating data with dev_raw_data_source and publishing it to a Kinesis stream using
    Kinesis producer. Data is then asserted for what is produced by dev raw data source and what we
    read from the stream.

    The stream name is the environment's stream prefix followed by ``stream_generator``, cut to at most
    128 characters. ``test_name`` is not used by the test body. If ``keep_data`` is truthy the Kinesis
    stream is left in place after the test.

    The pipeline looks like:
        dev_raw_data_source >> kinesis_producer
    """
    builder = sdc_builder.get_pipeline_builder()

    # Create dev_raw_data_source with 10 messages
    expected_data = [f'Hello {i}' for i in range(10)]
    source = builder.add_stage('Dev Raw Data Source')
    source.set_attributes(data_format='TEXT',
                          raw_data='\n'.join(expected_data),
                          stop_after_first_batch=True)

    # The stream name and the pipeline are prepared before entering the try block, so that the cleanup
    # in `finally` never refers to an unbound name when the stream cannot be created.
    stream_name = f'{aws.kinesis_stream_prefix}_{stream_generator}'
    # if stream is longer than 128 then select first 128 characters
    stream_name = stream_name[0:128]

    # Create Kinesis Producer
    kinesis_producer = builder.add_stage('Kinesis Producer')
    kinesis_producer.set_attributes(data_format='TEXT',
                                    stream_name=stream_name,
                                    record_separator='',
                                    preserve_record_order=True,
                                    kinesis_producer_configuration=[
                                        {'key': 'AggregationEnabled', 'value': 'false'}])

    source >> kinesis_producer
    pipeline = builder.build().configure_for_environment(aws)
    sdc_executor.add_pipeline(pipeline)

    # Create Kinesis stream and capture the ShardId
    client = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')
        desc_response = client.describe_stream(StreamName=stream_name)
        shard_id = desc_response['StreamDescription']['Shards'][0]['ShardId']

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # read data from Kinesis to assert it is what got ingested into the pipeline
        shard_iterator = client.get_shard_iterator(StreamName=stream_name,
                                                   ShardId=shard_id,
                                                   ShardIteratorType='TRIM_HORIZON')
        response = client.get_records(ShardIterator=shard_iterator['ShardIterator'])
        msgs_received = [rec['Data'].decode().strip() for rec in response['Records']]

        logger.debug('Number of messages received from Kinesis = %d', (len(msgs_received)))

        assert msgs_received == expected_data
    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        if not keep_data:
            logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
            client.delete_stream(StreamName=stream_name)
def test_multiple_batch(sdc_builder, sdc_executor, aws, keep_data):
    """Test for Kinesis producer destination stage with more than one batch.

    We generate data with a Dev Data Generator origin and publish it to a Kinesis stream using the
    Kinesis Producer stage. The records seen by a wiretap attached to the same origin are then compared
    with what we read back from the stream using the Kinesis client.

    If ``keep_data`` is truthy the Kinesis stream is left in place after the test.

    The pipeline looks like:
        dev_data_generator >> [kinesis_producer, wiretap]
    """
    builder = sdc_builder.get_pipeline_builder()

    # Create dev_data_generator; the test waits for batch_size * batches input records before stopping
    batch_size = 3
    batches = 2
    origin = builder.add_stage('Dev Data Generator')
    origin.set_attributes(batch_size=batch_size,
                          delay_between_batches=0,
                          fields_to_generate=[{"type": "CODE_IMEI", "field": "text"}])

    # The stream name and the pipeline are prepared before entering the try block, so that the cleanup
    # in `finally` never refers to an unbound name when the stream cannot be created.
    stream_name = f'{aws.kinesis_stream_prefix}_{get_random_string()}'

    # Create Kinesis Producer - Aggregation disabled avoids records to be compressed using protobuf
    # https://github.com/awslabs/amazon-kinesis-producer/issues/80 shows how to configure Kinesis
    # to avoid compression.
    kinesis_producer = builder.add_stage('Kinesis Producer')
    kinesis_producer.set_attributes(data_format='TEXT',
                                    binary_field_path='/text',
                                    stream_name=stream_name,
                                    record_separator='',
                                    preserve_record_order=True,
                                    kinesis_producer_configuration=[
                                        {'key': 'AggregationEnabled', 'value': 'false'}])

    wiretap = builder.add_wiretap()

    origin >> [kinesis_producer, wiretap.destination]
    pipeline = builder.build().configure_for_environment(aws)
    pipeline.configuration['rateLimit'] = 1
    sdc_executor.add_pipeline(pipeline)

    # Create Kinesis stream and capture the ShardId
    client = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')
        desc_response = client.describe_stream(StreamName=stream_name)
        shard_id = desc_response['StreamDescription']['Shards'][0]['ShardId']

        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count', batch_size * batches,
                                              timeout_sec=120)
        sdc_executor.stop_pipeline(pipeline)

        # read data from Kinesis to assert it is what got ingested into the pipeline
        shard_iterator = client.get_shard_iterator(StreamName=stream_name,
                                                   ShardId=shard_id,
                                                   ShardIteratorType='TRIM_HORIZON')["ShardIterator"]

        out = client.get_records(ShardIterator=shard_iterator)

        # Records read using kinesis client are retrieved and decoded
        response = [rec['Data'].decode() for rec in out['Records']]

        logger.debug('Number of messages received from Kinesis = %d', (len(response)))
        assert response == [record.field['text'].value for record in wiretap.output_records]
    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        if not keep_data:
            logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
            client.delete_stream(StreamName=stream_name)
def test_kinesis_consumer_additional_properties(sdc_builder, sdc_executor, aws, additional_configurations):
    """Test for Kinesis consumer origin stage with additional Kinesis configuration properties.

    We do so by publishing data to a test stream using the Kinesis client and having a pipeline which
    reads that data using the Kinesis consumer origin stage, configured with
    ``additional_configurations``. Data is then asserted for what is published by the Kinesis client
    and what we read in the pipeline.

    ``additional_configurations`` is indexed as ``additional_configurations[0]['key']``, so it must be
    a non-empty sequence of mappings with a 'key' entry. When that first key is 'a', the test expects
    a failure whose message contains the KINESIS_24 invalid setting error, and the DynamoDB table is
    not deleted during cleanup.

    The pipeline looks like:

    Kinesis Consumer pipeline:
        kinesis_consumer >> wiretap
    """
    invalid_config = False
    # build consumer pipeline
    application_name = get_random_string(string.ascii_letters, 10)
    stream_name = '{}_{}'.format(aws.kinesis_stream_prefix, get_random_string(string.ascii_letters, 10))

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    kinesis_consumer = builder.add_stage('Kinesis Consumer')
    kinesis_consumer.set_attributes(application_name=application_name,
                                    data_format='TEXT',
                                    initial_position='TRIM_HORIZON',
                                    stream_name=stream_name,
                                    kinesis_configuration=additional_configurations)

    wiretap = builder.add_wiretap()

    kinesis_consumer >> wiretap.destination

    consumer_origin_pipeline = builder.build(title='Kinesis Consumer pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    client = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')

        expected_messages = set('Message {0}'.format(i) for i in range(10))
        # not using PartitionKey logic and hence assign some temp key
        put_records = [{'Data': exp_msg, 'PartitionKey': '111'} for exp_msg in expected_messages]
        client.put_records(Records=put_records, StreamName=stream_name)

        # messages are published, read through the pipeline and assert
        sdc_executor.start_pipeline(consumer_origin_pipeline).wait_for_pipeline_output_records_count(11)
        sdc_executor.stop_pipeline(consumer_origin_pipeline)

        output_records = [record.field['text'].value for record in wiretap.output_records]

        assert set(output_records) == expected_messages
    except Exception as error:
        if additional_configurations[0]['key'] == 'a':
            # Not every exception type has a `message` attribute (an AssertionError raised above does
            # not), so fall back to the string form instead of failing with an AttributeError that
            # would hide the real problem.
            error_message = getattr(error, 'message', str(error))
            assert 'KINESIS_24 - Invalid setting for \'' + additional_configurations[0]['key'] + \
                   '\' property' in error_message
            invalid_config = True
        else:
            # Bare raise re-raises the active exception with its original traceback.
            raise
    finally:
        _ensure_pipeline_is_stopped(sdc_executor, consumer_origin_pipeline)
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        client.delete_stream(StreamName=stream_name)  # Stream operations are done. Delete the stream.
        if not invalid_config:
            logger.info('Deleting %s DynamoDB table on AWS ...', application_name)
            aws.dynamodb.delete_table(TableName=application_name)