def test_configurations_data_format_log(sdc_executor, sdc_builder, aws, data_format, log_format):
    """Check that the S3 origin can parse log files written in the given log format.

    A log file is uploaded under a random key prefix of the test bucket. The S3 origin reads and
    parses it, and the first record captured by the wiretap is compared with the expected output
    registered for ``log_format``.

    The pipeline is built by ``get_aws_origin_to_trash_pipeline`` and was originally described as:
        s3_origin >> trash
        s3_origin >= pipeline_finisher_executor
    """
    # GROK runs are fed the Apache custom log sample; every other format has its own sample.
    content_key = 'APACHE_CUSTOM_LOG_FORMAT' if log_format == 'GROK' else log_format
    file_content = data_format_content[content_key]

    client = aws.s3
    s3_key = f'{S3_SANDBOX_PREFIX}/{get_random_string()}'

    attributes = dict(bucket=aws.s3_bucket_name,
                      prefix_pattern=f'{s3_key}/*',
                      read_order='LEXICOGRAPHICAL',
                      data_format=data_format,
                      log_format=log_format,
                      custom_log_format='%h %l %u [%t] "%r" %>s %b',
                      regular_expression=REGULAR_EXPRESSION,
                      field_path_to_regex_group_mapping=LOG_FIELD_MAPPING)
    # The thread count is only configured for SDC 3.7.0 and later.
    if Version(sdc_builder.version) >= Version('3.7.0'):
        attributes['number_of_threads'] = 1

    pipeline, wiretap = get_aws_origin_to_trash_pipeline(sdc_builder, attributes, aws)

    try:
        object_key = f'{s3_key}/{get_random_string()}.log'
        client.put_object(Bucket=aws.s3_bucket_name, Key=object_key, Body=file_content)

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        first_record = wiretap.output_records[0]
        assert first_record.field == get_data_to_verify_output[log_format]
    finally:
        # Remove everything this test wrote under its key prefix.
        delete_aws_objects(client, aws, s3_key)
def test_kafka_origin_batch_max_size(sdc_builder, sdc_executor, cluster):
    """Check that retrieving messages from Kafka using Kafka Multitopic Consumer respects both the
    Batch Max Wait Time and the Max Batch Size. Batches are sent when the first of the two conditions
    is met. This test is checking that the Max Batch Size condition is first met.

    Kafka Multitopic Consumer Origin pipeline with standalone mode:
        kafka_multitopic_consumer >> wiretap
    """
    messages = [f'message{i}' for i in range(1, 21)]
    # Compare order-insensitively. 'message10' sorts before 'message2', so the expected list has to be
    # sorted with the same (lexicographic) ordering as the received one.
    expected = sorted(messages)

    num_batches = 2
    kafka_consumer_group = get_random_string(string.ascii_letters, 10)

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()
    kafka_multitopic_consumer = get_kafka_multitopic_consumer_stage(builder, cluster)

    # Publish the messages before the pipeline starts; the origin reads them from the earliest offset.
    produce_kafka_messages_list(kafka_multitopic_consumer.topic_list[0], cluster, messages, 'TEXT')

    if Version(sdc_builder.version) < Version('3.7.0'):
        kafka_multitopic_consumer.configuration_properties = [{'key': 'auto.offset.reset',
                                                               'value': 'earliest'}]
    else:
        kafka_multitopic_consumer.auto_offset_reset = 'EARLIEST'

    # The 30 s wait time is far longer than needed to fill a 10-record batch, so the batch size
    # threshold is the condition expected to trigger first.
    kafka_multitopic_consumer.set_attributes(consumer_group=kafka_consumer_group,
                                             max_batch_size_in_records=10,
                                             batch_wait_time_in_ms=30000)

    wiretap = builder.add_wiretap()
    kafka_multitopic_consumer >> wiretap.destination

    kafka_consumer_pipeline = builder.build(title='Kafka Multitopic pipeline Maximum batch size threshold') \
        .configure_for_environment(cluster)
    kafka_consumer_pipeline.configuration['shouldRetry'] = False
    kafka_consumer_pipeline.configuration['executionMode'] = 'STANDALONE'

    sdc_executor.add_pipeline(kafka_consumer_pipeline)

    # Start the pipeline and wait until all the published records have been read.
    sdc_executor.start_pipeline(kafka_consumer_pipeline)
    try:
        sdc_executor.wait_for_pipeline_metric(kafka_consumer_pipeline, 'input_record_count',
                                              num_batches * 10, timeout_sec=60)
    finally:
        # Stop the pipeline even when the wait times out, so it is not left running.
        sdc_executor.stop_pipeline(kafka_consumer_pipeline)

    received = sorted(str(record.field['text']) for record in wiretap.output_records)
    assert expected == received
def test_status_code(sdc_builder, sdc_executor):
    """Check that REST Service answers with the status code set on Send Response to Origin.

    A GET request carrying the application ID header is sent to the listening port and the HTTP
    status code of the response is compared with ``STATUS_CODE``.

    The pipeline looks like:
        rest_service >> send_response_to_origin
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    rest_service = pipeline_builder.add_stage('REST Service')
    # Application ID configuration differs by SDC version. The 3.16.x key name follows the sibling
    # test_keystore_file test in this file.
    sdc_version = Version(sdc_builder.version)
    if sdc_version < Version('3.16.0'):
        rest_service.application_id = APPLICATION_ID
    elif sdc_version < Version('3.17.0'):
        rest_service.list_of_application_ids = [{"appId": APPLICATION_ID}]
    else:
        rest_service.list_of_application_ids = [{"credential": APPLICATION_ID}]
    rest_service.http_listening_port = HTTP_LISTENING_PORT

    send_response_to_origin = pipeline_builder.add_stage('Send Response to Origin')
    send_response_to_origin.status_code = STATUS_CODE

    rest_service >> send_response_to_origin
    pipeline = pipeline_builder.build()

    # Build and start outside the try block: if either step fails there is nothing to stop, and a
    # ``finally`` referencing an unassigned ``pipeline`` would hide the real error behind a NameError.
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline)
    try:
        protocol = 'https' if sdc_executor.https else 'http'
        rest_service_url = f'{protocol}://{sdc_executor.server_host}:{HTTP_LISTENING_PORT}'
        response = requests.get(rest_service_url, headers={'X-SDC-APPLICATION-ID': APPLICATION_ID})
        assert response.status_code == STATUS_CODE
    finally:
        sdc_executor.stop_pipeline(pipeline)
def test_control_hub_api_processor_invalid_credentials(sdc_builder, sdc_executor):
    """Test Control Hub API Processor with invalid Control Hub credentials.

    The pipeline looks like:
        dev_raw_data_source >> control_hub_api_processor >> wiretap

    With invalid credentials the processor is expected to send the record to error, so the wiretap
    must capture no output records and exactly one error record.
    """
    builder = sdc_builder.get_pipeline_builder()

    source = builder.add_stage('Dev Raw Data Source')
    source.stop_after_first_batch = True

    processor = builder.add_stage('Control Hub API')
    processor.control_hub_api_url = 'https://cloud.streamsets.com/security/rest/v1/currentUser'
    processor.output_field = "/output"
    processor.control_hub_user_name = "invalid user"
    processor.password = "******"
    # The authentication type is only set for SDC 4.0.0 and later.
    if Version(sdc_builder.version) >= Version('4.0.0'):
        processor.authentication_type = 'USER_PASSWORD'

    wiretap = builder.add_wiretap()
    source >> processor >> wiretap.destination

    pipeline = builder.build('Control Hub API Processor Sample Pipeline')
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.validate_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    # The single input record must end up as an error record, not as output.
    output_count = len(wiretap.output_records)
    error_count = len(wiretap.error_records)
    assert output_count == 0
    assert error_count == 1
def test_use_security(sdc_builder, sdc_executor, elasticsearch, stage_attributes):
    """Test the "use security" configuration of the Elasticsearch origin.

    The pipeline looks like:
        Elasticsearch >> Wiretap

    The Elasticsearch server requires a username and password, so validation is expected to fail
    when use security is false. Otherwise the pipeline should find the document that was put into
    the index beforehand.
    """
    use_security = stage_attributes['use_security']
    index = get_random_string(string.ascii_lowercase)
    doc_id = get_random_string(string.ascii_lowercase)

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage('Elasticsearch', type='origin')
    origin.query = '{"query": {"match_all": {}}}'
    origin.index = index
    wiretap = builder.add_wiretap()
    origin >> wiretap.destination

    pipeline = builder.build().configure_for_environment(elasticsearch)

    # Adjust the stage after configure_for_environment, on the instance held by the built pipeline.
    configured_origin = pipeline.stages.get(label=origin.label)
    configured_origin.use_security = use_security
    if not use_security:
        # Blank out the credentials; how they are stored depends on the SDC version.
        if Version(sdc_builder.version) < Version('3.17.0'):
            configured_origin.configuration['conf.securityConfig.securityUser'] = ':'
        else:
            configured_origin.user_name = ''
            configured_origin.password = ''

    sdc_executor.add_pipeline(pipeline)
    elasticsearch.client.create_document(index=index, id=doc_id, body={"number": 1})

    try:
        if not use_security:
            with pytest.raises(ValidationError) as e:
                sdc_executor.validate_pipeline(pipeline)

            issues = e.value.issues
            assert issues['issueCount'] == 1
            message = issues['stageIssues'][origin.instance_name][0]['message']
            assert message.find('ELASTICSEARCH_47') != -1
        else:
            sdc_executor.start_pipeline(pipeline).wait_for_finished()

            assert len(wiretap.output_records) == 1
            field = wiretap.output_records[0].field
            assert field['_id'] == doc_id
            assert field['_index'] == index
            assert field['_source'] == {"number": 1}
    finally:
        elasticsearch.client.delete_index(index)
def hive_check(cluster, sdc_builder):
    """Skip the current test when Hive stages are not available (based on SDC-13915).

    The skip applies only to an ``AmbariCluster`` at version 3.1 combined with an SDC version
    older than 3.8.1; in every other case the function returns without doing anything.
    """
    if not isinstance(cluster, AmbariCluster):
        return
    if not Version(cluster.version) == Version('3.1'):
        return
    if not Version(sdc_builder.version) < Version('3.8.1'):
        return
    pytest.skip('Hive stages not available on HDP 3.1.0.0 for SDC versions before 3.8.1')
def test_field_decrypt(sdc_builder, sdc_executor, aws):
    """Basic test to verify that the Encrypt and Decrypt Fields processor can decrypt a field.

    The message text is encrypted through ``aws.encrypt`` and the ciphertext is fed to the pipeline;
    the decrypted field captured by the wiretap must equal the original plaintext bytes.

    The ciphertext is a byte array, but the raw data source has no way to specify one, so it is sent
    as a Base64 string. The Field Type Converter turns that string into a byte array, because the
    Base64 decoder requires a byte array as input, and the decoder then restores the ciphertext.
    (https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/Base64Decoder.html#concept_ujj_spy_kv)

    The pipeline looks like:
        dev_raw_data_source >> field_type_converter >> base64_decoder >> field_decrypt >> wiretap
    """
    expected_plaintext = MESSAGE_TEXT.encode()
    ciphertext, _ = aws.encrypt(expected_plaintext)
    encoded_ciphertext = base64.b64encode(ciphertext).decode()

    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       raw_data=json.dumps({'message': encoded_ciphertext}),
                                       stop_after_first_batch=True)

    field_type_converter = builder.add_stage('Field Type Converter', type='processor')
    field_type_converter.set_attributes(
        conversion_method='BY_FIELD',
        field_type_converter_configs=[{'fields': ['/message'], 'targetType': 'BYTE_ARRAY'}])

    base64_decoder = builder.add_stage('Base64 Field Decoder', type='processor')
    # The decoder is configured with a list of field mappings from SDC 4.4.0 on, and with a single
    # source/target field pair before that.
    if Version(sdc_builder.version) < Version("4.4.0"):
        base64_decoder.set_attributes(field_to_decode='/message', target_field='/message')
    else:
        field_mapping = {'originFieldPath': '/message', 'resultFieldPath': '/message'}
        base64_decoder.set_attributes(fields_to_decode=[field_mapping])

    field_decrypt = builder.add_stage('Encrypt and Decrypt Fields', type='processor')
    field_decrypt.set_attributes(cipher='ALG_AES_256_GCM_IV12_TAG16_HKDF_SHA384_ECDSA_P384',
                                 fields=['/message'],
                                 frame_size=4096,
                                 mode='DECRYPT')

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> field_type_converter >> base64_decoder >> field_decrypt >> wiretap.destination

    pipeline = builder.build('Field Decryption Pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    first_record = wiretap.output_records[0]
    actual_value = first_record.get_field_data('/message')
    assert actual_value == expected_plaintext
def test_topic_list(sdc_builder, sdc_executor, cluster):
    """Check that Kafka Multitopic Consumer reads a message from the topic given in its topic list.

    One text message is produced to a randomly named topic and the wiretap output is compared with
    the expected record.

    The pipeline looks like:
        kafka_multitopic_consumer >> wiretap
    """
    message = 'Hello World from SDC & DPM!'
    expected_records = [{'text': 'Hello World from SDC & DPM!'}]

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()
    topic_name = get_random_string()

    consumer_stage = builder.add_stage('Kafka Multitopic Consumer',
                                       library=cluster.kafka.standalone_stage_lib)

    # Before SDC 3.7.0 the offset reset policy is passed as a raw Kafka configuration property.
    older_than_3_7 = Version(sdc_builder.version) < Version('3.7.0')
    if older_than_3_7:
        consumer_stage.set_attributes(batch_wait_time_in_ms=2000,
                                      data_format='TEXT',
                                      topic_list=[topic_name])
        consumer_stage.configuration_properties = [{'key': 'auto.offset.reset',
                                                    'value': 'earliest'}]
    else:
        consumer_stage.set_attributes(auto_offset_reset='EARLIEST',
                                      batch_wait_time_in_ms=2000,
                                      data_format='TEXT',
                                      topic_list=[topic_name])

    wiretap = builder.add_wiretap()
    consumer_stage >> wiretap.destination

    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Publish the message to Kafka, then verify through the wiretap that it was received.
        cluster.kafka.producer().send(topic_name, message.encode())

        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count', 1, timeout_sec=120)
        sdc_executor.stop_pipeline(pipeline)

        actual_records = [record.field for record in wiretap.output_records]
        assert expected_records == actual_records
    finally:
        # Only stop the pipeline if a failure above left it running.
        status = sdc_executor.get_pipeline_status(pipeline).response.json().get('status')
        if status == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
def test_topic_list(sdc_builder, sdc_executor, cluster):
    """Check that Kafka Multitopic Consumer reads a message from the topic given in its topic list.

    One text message is produced to a randomly named topic and the origin output captured in a
    snapshot is compared with the expected record.

    The pipeline looks like:
        kafka_multitopic_consumer >> trash
    """
    message = 'Hello World from SDC & DPM!'
    expected_records = [{'text': 'Hello World from SDC & DPM!'}]

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()
    topic_name = get_random_string()

    consumer_stage = builder.add_stage('Kafka Multitopic Consumer',
                                       library=cluster.kafka.standalone_stage_lib)

    # Before SDC 3.7.0 the offset reset policy is passed as a raw Kafka configuration property.
    older_than_3_7 = Version(sdc_builder.version) < Version('3.7.0')
    if older_than_3_7:
        consumer_stage.set_attributes(batch_wait_time_in_ms=2000,
                                      data_format='TEXT',
                                      topic_list=[topic_name])
        consumer_stage.configuration_properties = [{'key': 'auto.offset.reset',
                                                    'value': 'earliest'}]
    else:
        consumer_stage.set_attributes(auto_offset_reset='EARLIEST',
                                      batch_wait_time_in_ms=2000,
                                      data_format='TEXT',
                                      topic_list=[topic_name])

    trash = builder.add_stage(label='Trash')
    consumer_stage >> trash

    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Publish the message to Kafka, then verify through a snapshot that it was received.
        cluster.kafka.producer().send(topic_name, message.encode())

        # Capturing the snapshot also starts the pipeline.
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot

        origin_output = snapshot[consumer_stage].output
        actual_records = [record.field for record in origin_output]
        assert expected_records == actual_records
    finally:
        sdc_executor.stop_pipeline(pipeline)
def test_keystore_file(sdc_builder, sdc_executor, stage_attributes):
    """Test the "KeyStore path" config parameter of REST Service.

    Two values are tested: one pointing to a real KeyStore file and one pointing to a file that does
    not exist. For the missing file a TLS_01 start error is expected; for the real file the pipeline
    must reach the RUNNING state.

    Pipeline:
        rest_srv >> trash
    """
    builder = sdc_builder.get_pipeline_builder()
    rest_srv = builder.add_stage('REST Service')

    # The application ID is configured differently in three SDC version ranges.
    sdc_version = Version(sdc_builder.version)
    if sdc_version < Version('3.16.0'):
        rest_srv.set_attributes(application_id='admin')
    elif sdc_version < Version('3.17.0'):
        rest_srv.set_attributes(list_of_application_ids=[{"appId": 'admin'}])
    else:
        rest_srv.set_attributes(list_of_application_ids=[{"credential": 'admin'}])

    rest_srv.set_attributes(keystore_type=KEYSTORE_TYPE,
                            keystore_password=KEYSTORE_PASSWORD,
                            **stage_attributes)

    trash = builder.add_stage('Trash')
    rest_srv >> trash

    pipeline = builder.build()
    sdc_executor.add_pipeline(pipeline)

    keystore_exists = stage_attributes['keystore_file'] == KEYSTORE_FILE_PATH
    if not keystore_exists:
        # A StartError (TLS_01) is expected from SDC because the KeyStore file does not exist.
        with pytest.raises(StartError) as e:
            sdc_executor.start_pipeline(pipeline).wait_for_status(status='RUNNING')
        assert e.value.message.startswith('TLS_01')
    else:
        # SDC is expected to load the KeyStore and start running the pipeline.
        sdc_executor.start_pipeline(pipeline).wait_for_status(status='RUNNING')
        sdc_executor.stop_pipeline(pipeline)
def test_invalid_execution_mode(sdc_executor, pipeline):
    """Start a pipeline whose executionMode is invalid and confirm the expected exception is raised.

    The expected exception depends on the SDC version: ``ValidationError`` from 2.7.0.0 on, and
    ``sdc_api.StartError`` for earlier versions.
    """
    pipeline.configuration['executionMode'] = 'Invalid_Execution_Mode'
    pipeline.id = 'Invalid_Execution_Mode Pipeline'

    try:
        sdc_executor.add_pipeline(pipeline)

        # Do a version check since execution_mode handling changed starting in the 2.7.0.0 version.
        if Version(sdc_executor.version) >= Version('2.7.0.0'):
            expected_error = ValidationError
        else:
            expected_error = sdc_api.StartError

        with pytest.raises(expected_error):
            # The failure is expected, so turn off log dumping on error while starting.
            sdc_executor.dump_log_on_error = False
            sdc_executor.start_pipeline(pipeline)
    finally:
        # Turn log dumping on error back on for subsequent tests.
        sdc_executor.dump_log_on_error = True
def test_base64_field_encoder(sdc_builder, sdc_executor):
    """Test the Base64 Field Encoder processor.

    The processor accepts a byte array, so a Field Type Converter first converts the raw input
    string to a byte array.

    The pipeline looks like:
        dev_raw_data_source >> field_type_converter >> base64_field_encoder >> wiretap
    """
    raw_data = 'hello there!'
    expected = base64.b64encode(raw_data.encode())

    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       raw_data=raw_data,
                                       stop_after_first_batch=True)

    field_type_converter = builder.add_stage('Field Type Converter')
    converter_configs = [{'fields': ['/text'], 'targetType': 'BYTE_ARRAY'}]
    field_type_converter.set_attributes(conversion_method='BY_FIELD',
                                        field_type_converter_configs=converter_configs)

    base64_field_encoder = builder.add_stage('Base64 Field Encoder', type='processor')
    # The encoder is configured with a list of field mappings from SDC 4.4.0 on, and with a single
    # source/target field pair before that.
    if Version(sdc_builder.version) < Version("4.4.0"):
        base64_field_encoder.set_attributes(field_to_encode='/text',
                                            target_field='/result',
                                            url_safe=True)
    else:
        field_mapping = {'originFieldPath': '/text', 'resultFieldPath': '/result'}
        base64_field_encoder.set_attributes(fields_to_encode=[field_mapping], url_safe=True)

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> field_type_converter >> base64_field_encoder >> wiretap.destination

    pipeline = builder.build('Base64 Encoder pipeline')
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    # The captured value is compared against a single Base64 encoding of the raw input bytes.
    result_data = wiretap.output_records[0].field['result'].value
    assert expected == result_data
def test_control_hub_api_processor(sdc_builder, sdc_executor):
    """Test Control Hub API Processor against the public health endpoint.

    The pipeline looks like:
        dev_raw_data_source >> control_hub_api_processor >> wiretap

    The processor calls "https://cloud.streamsets.com/public-rest/v1/health" and writes the response
    to the root field, so the output record is expected to carry ``alive`` set to true.
    """
    builder = sdc_builder.get_pipeline_builder()

    source = builder.add_stage('Dev Raw Data Source')
    source.stop_after_first_batch = True

    processor = builder.add_stage('Control Hub API')
    processor.control_hub_api_url = 'https://cloud.streamsets.com/public-rest/v1/health'
    processor.output_field = "/"
    processor.control_hub_user_name = "user"
    processor.password = "******"
    # The authentication type is only set for SDC 4.0.0 and later.
    if Version(sdc_builder.version) >= Version('4.0.0'):
        processor.authentication_type = 'USER_PASSWORD'

    wiretap = builder.add_wiretap()
    source >> processor >> wiretap.destination

    pipeline = builder.build('Control Hub API Processor Sample Pipeline')
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.validate_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    # Exactly one output record, no errors, and the health response must report alive.
    output_count = len(wiretap.output_records)
    error_count = len(wiretap.error_records)
    assert output_count == 1
    assert error_count == 0
    alive = wiretap.output_records[0].field['alive'].value
    assert alive == True
def test_base64_field_decoder(sdc_builder, sdc_executor):
    """Test the Base64 Field Decoder processor.

    The processor accepts a Base64 encoded byte array, so an intermediate Field Type Converter
    converts the Base64 input string to a byte array.

    The pipeline looks like:
        dev_raw_data_source >> field_type_converter >> base64_field_decoder >> wiretap
    """
    # The raw input is the Base64 encoding of normal_string, passed as text.
    normal_string = 'hello there!'.encode()
    raw_data = base64.b64encode(normal_string).decode()

    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       raw_data=raw_data,
                                       stop_after_first_batch=True)

    field_type_converter = builder.add_stage('Field Type Converter')
    converter_configs = [{'fields': ['/text'], 'targetType': 'BYTE_ARRAY'}]
    field_type_converter.set_attributes(conversion_method='BY_FIELD',
                                        field_type_converter_configs=converter_configs)

    base64_field_decoder = builder.add_stage('Base64 Field Decoder', type='processor')
    # The decoder is configured with a list of field mappings from SDC 4.4.0 on, and with a single
    # source/target field pair before that.
    if Version(sdc_builder.version) < Version("4.4.0"):
        base64_field_decoder.set_attributes(field_to_decode='/text', target_field='/result')
    else:
        field_mapping = {'originFieldPath': '/text', 'resultFieldPath': '/result'}
        base64_field_decoder.set_attributes(fields_to_decode=[field_mapping])

    wiretap = builder.add_wiretap()

    dev_raw_data_source >> field_type_converter >> base64_field_decoder >> wiretap.destination

    pipeline = builder.build('Base64 Decoder pipeline')
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished()

    # The decoded value must equal the original bytes from before they were Base64 encoded.
    result_data = wiretap.output_records[0].field['result'].value
    assert normal_string == result_data
def test_topic(sdc_builder, sdc_executor, cluster, stage_attributes):
    """Check that Kafka Producer writes its records to the configured topic.

    Seven identical text lines are sent to a randomly named topic and read back with a Kafka
    consumer subscribed to that topic.

    The pipeline looks like:
        dev_raw_data_source >> [kafka_destination, pipeline_finisher]

    Note that ``stage_attributes`` is updated in place when the keytab attribute is renamed.
    """
    topic = get_random_string()
    logger.debug('Kafka topic name: %s', topic)

    expected_messages = ['Hello World!'] * 7
    raw_data = '\n'.join(expected_messages)

    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='TEXT',
                                                                                 raw_data=raw_data)
    kafka_destination = builder.add_stage('Kafka Producer',
                                          library=cluster.kafka.standalone_stage_lib)

    # From SDC 3.19 on the attribute is passed as provide_keytab_at_runtime instead of provide_keytab.
    renames_keytab_attribute = Version(sdc_builder.version) >= Version('3.19')
    if renames_keytab_attribute and 'provide_keytab' in stage_attributes:
        stage_attributes['provide_keytab_at_runtime'] = stage_attributes.pop('provide_keytab')

    kafka_destination.set_attributes(topic=topic, data_format='TEXT', **stage_attributes)

    pipeline_finisher = builder.add_stage('Pipeline Finisher Executor')
    dev_raw_data_source >> [kafka_destination, pipeline_finisher]

    pipeline = builder.build().configure_for_environment(cluster)

    # Specify timeout so that iteration of consumer is stopped after that time and
    # specify auto_offset_reset to get messages from beginning.
    consumer = cluster.kafka.consumer(consumer_timeout_ms=5000, auto_offset_reset='earliest')
    consumer.subscribe([topic])

    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline)

    received_messages = []
    for kafka_message in consumer:
        received_messages.append(kafka_message.value.decode().strip())
    assert received_messages == expected_messages
def test_principal(sdc_builder, sdc_executor, cluster, stage_attributes, keytab_format=ENCODED_KEYTAB_CONTENTS):
    """Check that Kafka Producer works with a keytab and principal supplied on the stage.

    The keytab is passed either as Base64 encoded contents or as a credential function, depending on
    ``keytab_format``. Seven identical text lines are sent to a randomly named topic and read back
    with a Kafka consumer subscribed to that topic.

    The test is skipped when Kafka is not kerberized, when a credential function is requested but no
    credential store is enabled, when the group-enforcing variant is requested but the 'azure' store
    does not enforce entry groups, or when no keytab was provided for the `Kafka Producer` stage.

    The pipeline looks like:
        dev_raw_data_source >> [kafka_destination, pipeline_finisher]

    Note that ``stage_attributes`` is updated in place with the keytab and principal settings.

    Raises:
        ValueError: If ``keytab_format`` is not one of the supported formats.
    """
    if not cluster.kafka.is_kerberized:
        pytest.skip('Test runs only if Kafka is kerberized')

    cloudera_streamsets = cluster.streamsets
    if keytab_format in [CREDENTIAL_FUNCTION, CREDENTIAL_FUNCTION_WITH_GROUP]:
        if not cloudera_streamsets.credential_stores:
            pytest.skip('Test with credential function runs only if credential store was enabled')

    if keytab_format == CREDENTIAL_FUNCTION_WITH_GROUP:
        azure_keyvault = cloudera_streamsets.credential_stores.get('azure')
        if not azure_keyvault or not azure_keyvault.enforce_entry_group:
            pytest.skip('Test with credential function with enforce group runs only'
                        ' if enforceEntryGroup was set to True')

    encoded_keytabs_for_stages = getattr(cluster.kafka, 'encoded_keytabs_for_stages', None)
    keytab_for_stage = (encoded_keytabs_for_stages.get('Kafka Producer')
                        if encoded_keytabs_for_stages else None)
    if not keytab_for_stage:
        pytest.skip('Test runs only if --stage-keytab argument is provided for `Kafka Producer` stage')

    if keytab_format == ENCODED_KEYTAB_CONTENTS:
        keytab_value = keytab_for_stage.base64_encoded_keytab_contents
    elif keytab_format in [CREDENTIAL_FUNCTION, CREDENTIAL_FUNCTION_WITH_GROUP]:
        keytab_value = keytab_for_stage.credential_function_for_keytab
    else:
        # Fail here with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f'Unsupported keytab_format: {keytab_format!r}')

    # Run the pipeline and verify it works as expected.
    topic = get_random_string()
    logger.debug('Kafka topic name: %s', topic)

    DATA = ['Hello World!' for _ in range(7)]
    raw_data = '\n'.join(DATA)

    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='TEXT',
                                                                                 raw_data=raw_data)
    kafka_destination = builder.add_stage('Kafka Producer',
                                          library=cluster.kafka.standalone_stage_lib)

    # The keytab-related attribute names differ before and from SDC 3.19.
    if Version(sdc_builder.version) < Version('3.19'):
        stage_attributes.update({'keytab': keytab_value,
                                 'principal': keytab_for_stage.principal})
    else:
        if 'provide_keytab' in stage_attributes:
            stage_attributes['provide_keytab_at_runtime'] = stage_attributes.pop('provide_keytab')
        stage_attributes.update({'runtime_keytab': keytab_value,
                                 'runtime_principal': keytab_for_stage.principal})

    kafka_destination.set_attributes(data_format='TEXT', topic=topic, **stage_attributes)

    pipeline_finisher = builder.add_stage('Pipeline Finisher Executor')
    dev_raw_data_source >> [kafka_destination, pipeline_finisher]

    pipeline = builder.build().configure_for_environment(cluster)

    # Specify timeout so that iteration of consumer is stopped after that time and
    # specify auto_offset_reset to get messages from beginning.
    consumer = cluster.kafka.consumer(consumer_timeout_ms=5000, auto_offset_reset='earliest')
    consumer.subscribe([topic])

    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline)

    messages = [message.value.decode().strip() for message in consumer]
    assert messages == DATA
def test_kafka_origin_batch_max_size(sdc_builder, sdc_executor, cluster):
    """Check that Kafka Multitopic Consumer respects both the Batch Max Wait Time and the Max Batch
    Size when retrieving messages. A batch is sent as soon as the first of the two conditions is met;
    this test covers the case where the Max Batch Size is reached first.

    Kafka Multitopic Consumer Origin pipeline with standalone mode:
        kafka_multitopic_consumer >> trash
    """
    messages = [f'message{i}' for i in range(1, 21)]
    expected = [f'message{i}' for i in range(1, 21)]

    num_batches = 2
    kafka_consumer_group = get_random_string(string.ascii_letters, 10)

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()
    consumer_stage = get_kafka_multitopic_consumer_stage(builder, cluster)

    first_topic = consumer_stage.topic_list[0]
    produce_kafka_messages_list(first_topic, cluster, messages, 'TEXT')

    # Before SDC 3.7.0 the offset reset policy is passed as a raw Kafka configuration property.
    if Version(sdc_builder.version) < Version('3.7.0'):
        consumer_stage.configuration_properties = [{'key': 'auto.offset.reset',
                                                    'value': 'earliest'}]
    else:
        consumer_stage.auto_offset_reset = 'EARLIEST'

    consumer_stage.set_attributes(consumer_group=kafka_consumer_group,
                                  max_batch_size_in_records=10,
                                  batch_wait_time_in_ms=30000)

    trash = builder.add_stage(label='Trash')
    consumer_stage >> trash

    built_pipeline = builder.build(title='Kafka Multitopic pipeline Maximum batch size threshold')
    kafka_consumer_pipeline = built_pipeline.configure_for_environment(cluster)
    kafka_consumer_pipeline.configuration['shouldRetry'] = False
    kafka_consumer_pipeline.configuration['executionMode'] = 'STANDALONE'

    sdc_executor.add_pipeline(kafka_consumer_pipeline)

    try:
        # Capturing the snapshot starts the pipeline; the captured batches are then checked against
        # the messages that were published.
        snapshot = sdc_executor.capture_snapshot(kafka_consumer_pipeline,
                                                 start_pipeline=True,
                                                 batches=num_batches,
                                                 batch_size=10).snapshot

        records_fields = []
        for snapshot_batch in snapshot.snapshot_batches:
            # Output of the first pipeline stage for this batch.
            stage_output = snapshot_batch[kafka_consumer_pipeline[0].instance_name]
            for lane_records in stage_output.output_lanes.values():
                records_fields.extend(str(record.field['text']) for record in lane_records)

        assert expected == records_fields
    finally:
        sdc_executor.stop_pipeline(kafka_consumer_pipeline, force=True)
def test_security_username_and_password(sdc_builder, sdc_executor, elasticsearch, stage_attributes):
    """Test the username and password configurations of the Elasticsearch origin.

    The pipeline looks like:
        Elasticsearch >> Wiretap

    Different combinations of valid, invalid and empty username/password values are checked. For
    each credential, ``None`` selects an empty value, a true value selects the real credential, and
    a false value selects a random string. When no error code is expected, the pipeline must read
    the document created beforehand; otherwise validation must fail with the expected error code.
    """
    # Resolve the password first and the username second, as random strings are drawn in that order.
    valid_password = stage_attributes['with_valid_password']
    if valid_password is None:
        password = ''
    else:
        password = elasticsearch.password if valid_password else get_random_string()

    valid_username = stage_attributes['with_valid_username']
    if valid_username is None:
        username = ''
    else:
        username = elasticsearch.username if valid_username else get_random_string()

    index = get_random_string(string.ascii_lowercase)
    doc_id = get_random_string(string.ascii_lowercase)

    builder = sdc_builder.get_pipeline_builder()
    origin = builder.add_stage('Elasticsearch', type='origin')
    origin.query = '{"query": {"match_all": {}}}'
    origin.index = index
    wiretap = builder.add_wiretap()
    origin >> wiretap.destination

    pipeline = builder.build().configure_for_environment(elasticsearch)

    # Set the credentials after configure_for_environment, on the instance held by the built pipeline.
    configured_origin = pipeline.stages.get(label=origin.label)
    if Version(sdc_builder.version) < Version('3.17.0'):
        configured_origin.configuration['conf.securityConfig.securityUser'] = f'{username}:{password}'
    else:
        configured_origin.user_name = username
        configured_origin.password = password

    sdc_executor.add_pipeline(pipeline)

    expected_error_code = stage_attributes['error_code']
    if expected_error_code is not None:
        with pytest.raises(ValidationError) as e:
            sdc_executor.validate_pipeline(pipeline)

        issues = e.value.issues
        assert issues['issueCount'] == 1
        message = issues['stageIssues'][origin.instance_name][0]['message']
        assert message.find(expected_error_code) != -1
    else:
        elasticsearch.client.create_document(index=index, id=doc_id, body={"number": 1})
        try:
            sdc_executor.start_pipeline(pipeline).wait_for_finished()

            assert len(wiretap.output_records) == 1
            field = wiretap.output_records[0].field
            assert field['_index'] == index
            assert field['_id'] == doc_id
            assert field['_source'] == {"number": 1}
        finally:
            elasticsearch.client.delete_index(index)
def test_jdbc_multitable_consumer_to_hive(sdc_builder, sdc_executor, database, cluster,
                                          table_name_characters, table_name_length):
    """Validate an end to end case of reading multiple tables from a JDBC source and making sure they
    are written to Hadoop FS. The Hive Metadata processor is used for drift synchronization.

    The pipeline looks like:
        jdbc_multitable_consumer >= pipeline_finisher_executor
        jdbc_multitable_consumer >> expression_evaluator >> field_remover >> hive_metadata
        hive_metadata >> hadoop_fs
        hive_metadata >> hive_metastore

    Two source tables sharing a random suffix are created and loaded with the same three rows. Once the
    pipeline has finished, each table is queried through Hive and compared with the inserted rows.

    Note: Numeric fixture of the test fails till SDC-6766 is addressed.
    """
    # based on SDC-13915
    if (isinstance(cluster, AmbariCluster) and Version(cluster.version) == Version('3.1')
            and Version(sdc_builder.version) < Version('3.8.1')):
        pytest.skip('Hive stages not available on HDP 3.1.0.0 for SDC versions before 3.8.1')

    # Generate two random strings to use when naming the DB tables at the source.
    # The suffix is lowercase for db compatibility (e.g. PostgreSQL).
    src_table_suffix = get_random_string(string.ascii_lowercase, 6)
    random_table_name_1 = '{}_{}'.format(get_random_string(table_name_characters, table_name_length),
                                         src_table_suffix)
    random_table_name_2 = '{}_{}'.format(get_random_string(table_name_characters, table_name_length),
                                         src_table_suffix)

    # build the pipeline
    pipeline_builder = sdc_builder.get_pipeline_builder()

    jdbc_multitable_consumer = pipeline_builder.add_stage('JDBC Multitable Consumer')
    jdbc_multitable_consumer.set_attributes(table_configuration=[{'tablePattern': f'%{src_table_suffix}'}])

    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    expression_evaluator.header_attribute_expressions = [{
        'attributeToSet': 'database',
        'headerAttributeExpression': f'{database.database}'
    }, {
        'attributeToSet': 'dt',
        'headerAttributeExpression': "${record:value('/dt')}"
    }, {
        'attributeToSet': 'table_name',
        'headerAttributeExpression': "${record:attribute('jdbc.tables')}"
    }]

    field_remover = pipeline_builder.add_stage('Field Remover')
    field_remover.fields = ["/dt"]

    hive_metadata = pipeline_builder.add_stage('Hive Metadata')
    hive_metadata.set_attributes(
        data_format='AVRO',
        database_expression="${record:attribute('database')}",
        decimal_precision_expression=("${record:attribute(str:concat(str:concat("
                                      "'jdbc.', field:field()), '.precision'))}"),
        decimal_scale_expression=("${record:attribute(str:concat(str:concat("
                                  "'jdbc.', field:field()), '.scale'))}"),
        table_name="${record:attribute('table_name')}")

    hadoop_fs = pipeline_builder.add_stage('Hadoop FS', type='destination')
    hadoop_fs.set_attributes(avro_schema_location='HEADER',
                             data_format='AVRO',
                             directory_in_header=True,
                             file_type='TEXT',
                             files_prefix='sdc-${sdc:id()}',
                             files_suffix='avro',
                             max_file_size=0,
                             max_records_in_file=0,
                             roll_attribute_name='roll',
                             use_roll_attribute=True)

    hive_metastore = pipeline_builder.add_stage('Hive Metastore', type='destination')
    hive_metastore.set_attributes(stored_as_avro=True)

    pipeline_finisher_executor = pipeline_builder.add_stage('Pipeline Finisher Executor')

    jdbc_multitable_consumer >= pipeline_finisher_executor
    jdbc_multitable_consumer >> expression_evaluator >> field_remover >> hive_metadata
    hive_metadata >> hadoop_fs
    hive_metadata >> hive_metastore

    pipeline = pipeline_builder.build(title='Multi-table consumer to Hive').configure_for_environment(cluster,
                                                                                                     database)
    sdc_executor.add_pipeline(pipeline)

    # The same rows are loaded into every source table.
    rows = [{'event_id': 1, 'order_id': 123, 'event_type': 'SHIPPED', 'dt': '2017-07-13'},
            {'event_id': 2, 'order_id': 234, 'event_type': 'ARRIVED', 'dt': '2017-07-13'},
            {'event_id': 3, 'order_id': 345, 'event_type': 'READY', 'dt': '2017-07-13'}]

    tables = []
    # Bound before the try block so that the cleanup code can always refer to it.
    hive_cursor = None
    try:
        # create table and load data in the JDBC database
        for table_name in (random_table_name_1, random_table_name_2):
            logger.info('Creating table %s in %s database ...', table_name, database.type)
            table = sqlalchemy.Table(table_name,
                                     sqlalchemy.MetaData(),
                                     sqlalchemy.Column('event_id', sqlalchemy.Integer, primary_key=True),
                                     sqlalchemy.Column('order_id', sqlalchemy.Integer),
                                     sqlalchemy.Column('event_type', sqlalchemy.String(32)),
                                     sqlalchemy.Column('dt', sqlalchemy.String(20)))
            table.create(database.engine)
            tables.append(table)

            logger.info('Adding %s rows to %s of %s database ...', len(rows), table_name, database.type)
            # Close the connection once the rows are inserted instead of leaking one per table.
            with database.engine.connect() as connection:
                connection.execute(table.insert(), rows)

        # Run the pipeline and wait for the Pipeline Finisher Executor to end it, so that the Hive
        # queries below do not race against a pipeline that is still writing.
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # Check that the data shows up in Hive.
        hive_cursor = cluster.hive.client.cursor()
        raw_values = [list(row.values()) for row in rows]
        for table in tables:
            table_name = table.name if not database.type == 'Oracle' else table.name.upper()
            logger.info('Asserting table %s', table_name)
            hive_cursor.execute(f'SELECT * from `{table_name}`')
            hive_values = [list(row) for row in hive_cursor.fetchall()]
            assert sorted(hive_values) == sorted(raw_values)
    finally:
        if tables and hive_cursor is None:
            # The test failed before the verification step; a cursor is still needed for cleanup.
            hive_cursor = cluster.hive.client.cursor()
        for table in tables:
            table_name = table.name if not database.type == 'Oracle' else table.name.upper()
            logger.info('Dropping table %s in %s database ...', table_name, database.type)
            table.drop(database.engine)
            logger.info('Dropping table %s in Hive ...', table_name)
            # IF EXISTS keeps cleanup from failing (and hiding the real error) when the pipeline
            # never got as far as creating the Hive table.
            hive_cursor.execute(f'DROP TABLE IF EXISTS `{table_name}`')
def test_start_pipeline_processor(sdc_builder, sdc_executor):
    """Test the Start Pipeline orchestrator stage used as an origin and as a processor.

    The pipeline would look like:
        start_pipeline1 >> start_pipeline2 >> pipeline_finisher

    Two batch pipelines are created first. ``start_pipeline1`` refers to the first one by ID and
    ``start_pipeline2`` refers to the second one by its (unique) title. The snapshot of the chaining
    pipeline must hold one record per stage whose ``orchestratorTasks`` field describes the tasks
    executed so far.
    """
    is_3_17_or_later = Version(sdc_builder.version) >= Version('3.17.0')
    # From 3.17.0 on, the stage label and the task name attribute differ and metrics output is checked.
    start_pipeline_stage_label = 'Start Pipelines' if is_3_17_or_later else 'Start Pipeline'
    task_name_attribute = 'task_name' if is_3_17_or_later else 'unique_task_name'
    metrics_output_generated = is_3_17_or_later

    # Pipeline - pipeline1
    pipeline1 = _create_batch_pipeline(sdc_builder, 'test_start_pipeline_processor1')
    sdc_executor.add_pipeline(pipeline1)

    # Pipeline - pipeline2
    unique_title = str(uuid.uuid4())
    pipeline2 = _create_batch_pipeline(sdc_builder, unique_title)
    sdc_executor.add_pipeline(pipeline2)

    # Chain Pipeline Execution Sample (start_pipeline1 >> start_pipeline2 >> pipeline_finisher)
    pipeline_builder = sdc_builder.get_pipeline_builder()

    def add_start_stage(stage_type, task_name, id_type, pipeline_id):
        # Add one Start Pipeline stage pointing at a single target pipeline.
        stage = pipeline_builder.add_stage(start_pipeline_stage_label, type=stage_type)
        setattr(stage, task_name_attribute, task_name)
        stage.pipelines = [{'pipelineIdType': id_type, 'pipelineId': pipeline_id}]
        return stage

    start_pipeline1 = add_start_stage('origin', 'task1', 'ID', pipeline1.id)
    start_pipeline2 = add_start_stage('processor', 'task2', 'TITLE', unique_title)
    pipeline_finisher = pipeline_builder.add_stage('Pipeline Finisher Executor')

    start_pipeline1 >> start_pipeline2 >> pipeline_finisher
    pipeline = pipeline_builder.build('Chain Pipeline Execution Sample')
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.validate_pipeline(pipeline)
    snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot

    # Assert start_pipeline1 record output
    first_output = snapshot[start_pipeline1.instance_name].output
    assert len(first_output) == 1
    _validate_start_pipeline_output(first_output[0].field['orchestratorTasks'], 'task1', pipeline1, True,
                                    metrics_output_generated)

    # Assert start_pipeline2 record output - start_pipeline2 output should contain output of both pipelines
    second_output = snapshot[start_pipeline2.instance_name].output
    assert len(second_output) == 1
    for task_name, started_pipeline in (('task1', pipeline1), ('task2', pipeline2)):
        _validate_start_pipeline_output(second_output[0].field['orchestratorTasks'], task_name,
                                        started_pipeline, True, metrics_output_generated)
def test_principal(sdc_builder, sdc_executor, cluster, stage_attributes, keytab_format=ENCODED_KEYTAB_CONTENTS):
    """Run a Kafka Consumer configured with the keytab and principal supplied for the stage.

    The pipeline is:
        kafka_consumer >> wiretap

    The test is skipped unless Kafka is kerberized and a keytab was provided for the `Kafka Consumer`
    stage. For the credential function formats it is also skipped unless a credential store is
    enabled (and, for the "with group" variant, unless the Azure store enforces entry groups).
    One message is published to a random topic and the wiretap must receive exactly that message.

    ``keytab_format`` selects whether the keytab is passed as base64 encoded contents or as a
    credential function.
    """
    if not cluster.kafka.is_kerberized:
        pytest.skip('Test runs only if Kafka is kerberized')
    cloudera_streamsets = getattr(cluster, 'streamsets')
    if keytab_format in [CREDENTIAL_FUNCTION, CREDENTIAL_FUNCTION_WITH_GROUP]:
        if not cloudera_streamsets.credential_stores:
            pytest.skip('Test with credential function runs only if credential store was enabled')
        if keytab_format in [CREDENTIAL_FUNCTION_WITH_GROUP]:
            azure_keyvault = cloudera_streamsets.credential_stores.get('azure')
            if not azure_keyvault or not azure_keyvault.enforce_entry_group:
                pytest.skip('Test with credential function with enforce group runs only'
                            ' if enforceEntryGroup was set to True')

    encoded_keytabs_for_stages = getattr(cluster.kafka, 'encoded_keytabs_for_stages', None)
    keytab_for_stage = (encoded_keytabs_for_stages.get('Kafka Consumer')
                        if encoded_keytabs_for_stages else None)
    if not keytab_for_stage:
        pytest.skip('Test runs only if --stage-keytab argument is provided for `Kafka Consumer` stage')

    if keytab_format == ENCODED_KEYTAB_CONTENTS:
        keytab_value = keytab_for_stage.base64_encoded_keytab_contents
    elif keytab_format in [CREDENTIAL_FUNCTION, CREDENTIAL_FUNCTION_WITH_GROUP]:
        keytab_value = keytab_for_stage.credential_function_for_keytab

    MESSAGE = 'Hello World from SDC & DPM!'
    EXPECTED = {'text': 'Hello World from SDC & DPM!'}

    # Work on a copy: the dictionary belongs to the caller (test parametrization) and must not be
    # altered by the key renaming and the keytab/principal additions below.
    stage_attributes = dict(stage_attributes)

    # Build the Kafka consumer pipeline with Standalone mode.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    topic_name = get_random_string()
    kafka_consumer = builder.add_stage('Kafka Consumer', library=cluster.kafka.standalone_stage_lib)

    if Version(sdc_builder.version) < Version('3.19'):
        stage_attributes.update({'keytab': keytab_value,
                                 'principal': keytab_for_stage.principal})
    else:
        # From 3.19 on the attributes carry a "runtime" qualifier.
        if 'provide_keytab' in stage_attributes:
            stage_attributes['provide_keytab_at_runtime'] = stage_attributes.pop('provide_keytab')
        stage_attributes.update({'runtime_keytab': keytab_value,
                                 'runtime_principal': keytab_for_stage.principal})

    # Default stage configuration.
    kafka_consumer.set_attributes(auto_offset_reset='EARLIEST',
                                  batch_wait_time_in_ms=20000,
                                  data_format='TEXT',
                                  topic=topic_name,
                                  **stage_attributes)

    wiretap = builder.add_wiretap()
    kafka_consumer >> wiretap.destination
    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Publish messages to Kafka and verify using wiretap if the same messages are received.
        producer = cluster.kafka.producer()
        producer.send(topic_name, MESSAGE.encode())
        # Make sure the message has actually left the producer before the pipeline is started.
        producer.flush()

        # Start Pipeline.
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count', 1, timeout_sec=120)
        sdc_executor.stop_pipeline(pipeline)

        # Verify wiretap data.
        records = [record.field for record in wiretap.output_records]
        assert [EXPECTED] == records
    finally:
        # Only stop the pipeline when a failure above left it running.
        if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
def test_kafka_origin_not_saving_offset(sdc_builder, sdc_executor, cluster):
    """Ensure that we read all the data, even when a pipeline fails - thus no records are "auto committed".

    The test runs the same pipeline twice - once with failure and a second time with success - and
    ensures that the second run sees all the records.

    The pipeline reads from Kafka and uses a delay processor to model longer processing time (so that
    Kafka's auto commit takes place) and then a Jython processor to generate the pipeline failure (1/0).
    The divisor is the ``DIVISOR`` pipeline parameter: 0 for the failing run and 1 for the second run.

        kafka_multitopic_consumer >> delay >> jython_evaluator >> wiretap
    """
    topic = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()

    origin = get_kafka_multitopic_consumer_stage(builder, cluster)
    origin.topic_list = [topic]
    origin.consumer_group = get_random_string(string.ascii_letters, 10)
    origin.batch_wait_time_in_ms = 100

    if Version(sdc_builder.version) < Version('3.7.0'):
        origin.configuration_properties = [{'key': 'auto.offset.reset', 'value': 'earliest'}]
    else:
        origin.auto_offset_reset = 'EARLIEST'

    delay = builder.add_stage('Delay')
    delay.delay_between_batches = 5 * 1000

    script = builder.add_stage('Jython Evaluator', type='processor')
    # The first line of the script divides by the DIVISOR parameter.
    script.script = """1/${DIVISOR}
for record in sdc.records:
  try:
    sdc.output.write(record)
  except Exception as e:
    sdc.error.write(record, str(e))
"""

    wiretap = builder.add_wiretap()

    origin >> delay >> script >> wiretap.destination

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    pipeline.configuration['executionMode'] = 'STANDALONE'
    pipeline.add_parameters(DIVISOR='0')

    sdc_executor.add_pipeline(pipeline)

    # Produce one message
    producer = cluster.kafka.producer()
    producer.send(topic, 'Super Secret Message'.encode())
    producer.flush()

    try:
        # Start our pipeline - it should fail
        sdc_executor.start_pipeline(pipeline, runtime_parameters={'DIVISOR': 0}).wait_for_status(
            'RUN_ERROR', ignore_errors=True)

        # Adding second message so that the topic have at least one new message, so that getting an older
        # versions won't time out but returns immediately.
        producer = cluster.kafka.producer()
        producer.send(topic, 'Not So Super Secret Message'.encode())
        producer.flush()

        # Now run the pipeline second time and it should succeed
        sdc_executor.start_pipeline(pipeline, runtime_parameters={'DIVISOR': 1})
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count', 2)

        # Now this should still read both records
        records = wiretap.output_records
        assert len(records) == 2
        assert records[0].field['text'] == 'Super Secret Message'
        assert records[1].field['text'] == 'Not So Super Secret Message'
    finally:
        # Stop only a running pipeline: when the test fails while the pipeline is not running (for
        # example still in RUN_ERROR), an unconditional stop would fail and hide the original error.
        if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
def test_kafka_origin_save_offset(sdc_builder, sdc_executor, cluster):
    """Check that the offset is committed when a batch holds fewer records than the max batch size.

    SDC-10501 introduced a bug which does not commit the offset when the number of records is less
    than the max batch size. This test processes 5 records in a first run, stops the pipeline and
    runs it again after producing 3 more records. The second run must process only those 3 records,
    as the offset should have been saved after the first run.

        Kafka Multitopic Origin >> Wiretap (run twice)
    """
    topic = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()

    kafka_multitopic_consumer = get_kafka_multitopic_consumer_stage(builder, cluster)
    kafka_multitopic_consumer.topic_list = [topic]
    kafka_multitopic_consumer.consumer_group = get_random_string(string.ascii_letters, 10)
    kafka_multitopic_consumer.batch_wait_time_in_ms = 100

    if Version(sdc_builder.version) < Version('3.7.0'):
        offset_reset_property = {'key': 'auto.offset.reset', 'value': 'earliest'}
        kafka_multitopic_consumer.configuration_properties = [offset_reset_property]
    else:
        kafka_multitopic_consumer.auto_offset_reset = 'EARLIEST'

    wiretap = builder.add_wiretap()
    kafka_multitopic_consumer >> wiretap.destination

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    pipeline.configuration['executionMode'] = 'STANDALONE'

    sdc_executor.add_pipeline(pipeline)

    # Produce 5 messages
    first_messages = [f'message{number}' for number in range(0, 5)]
    produce_kafka_messages_list(kafka_multitopic_consumer.topic_list[0], cluster, first_messages, 'TEXT')

    try:
        # First run: start the pipeline, read one batch and stop.
        sdc_executor.start_pipeline(pipeline)
        sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count', 5)
        sdc_executor.stop_pipeline(pipeline)

        # Check that the pipeline processed the 5 records
        first_run_texts = [f'{record.field["text"]}' for record in wiretap.output_records]
        assert len(first_run_texts) == 5
        assert sorted(first_messages) == sorted(first_run_texts)

        # Produce another 3 messages
        second_messages = [f'message{number}' for number in range(5, 8)]
        produce_kafka_messages_list(kafka_multitopic_consumer.topic_list[0], cluster, second_messages, 'TEXT')

        # Reset the wiretap so that records of the first run are not counted again
        wiretap.reset()

        # Second run: let the pipeline run for a fixed time, then stop it
        sdc_executor.start_pipeline(pipeline)
        time.sleep(10)
        sdc_executor.stop_pipeline(pipeline)

        # The second run should have processed only the 3 new records
        second_run_texts = [f'{record.field["text"]}' for record in wiretap.output_records]
        assert len(second_run_texts) == 3
        assert sorted(second_messages) == sorted(second_run_texts)
    finally:
        status = sdc_executor.get_pipeline_status(pipeline).response.json().get('status')
        if status == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
def test_kudu_lookup_apply_default(sdc_builder, sdc_executor, cluster):
    """Test the case where a row matching the primary key is found, but the column that the lookup
    processor needs to return doesn't have a value. When a default value is configured, apply the value.

        dev_raw_data_source >> record_deduplicator >> kudu >> trash
        record_deduplicator >> to_error
    """
    if not hasattr(cluster, 'kudu'):
        pytest.skip('Kudu tests only run against clusters with the Kudu service present.')

    tour_de_france_contenders = [dict(favorite_rank=1),
                                 dict(favorite_rank=2)]

    raw_data = ''.join(json.dumps(contender) for contender in tour_de_france_contenders)
    key_columns_mapping = [dict(field='/favorite_rank', columnName='rank')]
    # 'name' has no default value while 'wins' falls back to '0'.
    column_to_output_field_mapping = [dict(columnName='name', field='/name', defaultValue=None),
                                      dict(columnName='wins', field='/wins', defaultValue='0')]

    kudu_table_name = get_random_string(string.ascii_letters, 10)
    kudu_master_address = '{}:{}'.format(cluster.server_host, DEFAULT_KUDU_PORT)

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data)
    kudu = builder.add_stage('Kudu Lookup',
                             type='processor').set_attributes(kudu_masters=kudu_master_address,
                                                              kudu_table_name='{}.{}'.format('impala::default',
                                                                                             kudu_table_name),
                                                              key_columns_mapping=key_columns_mapping,
                                                              column_to_output_field_mapping=column_to_output_field_mapping,
                                                              case_sensitive=True,
                                                              ignore_missing_value=True)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')
    trash = builder.add_stage('Trash')
    dev_raw_data_source >> record_deduplicator >> kudu >> trash
    record_deduplicator >> to_error

    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    metadata = sqlalchemy.MetaData()
    tdf_contenders_table = sqlalchemy.Table(kudu_table_name,
                                            metadata,
                                            sqlalchemy.Column('rank', sqlalchemy.Integer, primary_key=True),
                                            sqlalchemy.Column('name', sqlalchemy.String),
                                            sqlalchemy.Column('wins', sqlalchemy.Integer),
                                            impala_partition_by='HASH PARTITIONS 16',
                                            impala_stored_as='KUDU',
                                            impala_table_properties={
                                                'kudu.master_addresses': kudu_master_address,
                                                'kudu.num_tablet_replicas': '1'
                                            })

    # Resolved before the try block so that the cleanup code can always refer to it.
    engine = cluster.kudu.engine
    try:
        logger.info('Creating Kudu table %s ...', kudu_table_name)
        tdf_contenders_table.create(engine)

        # Rows exist for both keys, but the looked up columns hold no value.
        # The connection is closed as soon as the rows are inserted.
        with engine.connect() as conn:
            conn.execute(tdf_contenders_table.insert(), [
                {'rank': 1, 'name': None, 'wins': None},
                {'rank': 2, 'name': None, 'wins': None}])

        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)

        name_field_omitted = Version(sdc_executor.version) >= Version('3.2.0.0')
        for result in snapshot[kudu.instance_name].output:
            if name_field_omitted:
                assert 'name' not in result.field
            else:
                assert result.field['name'].value == 'None'
            assert int(result.field['wins'].value) == 0
    finally:
        logger.info('Dropping Kudu table %s ...', kudu_table_name)
        tdf_contenders_table.drop(engine)
def test_kudu_lookup_decimal_type(sdc_builder, sdc_executor, cluster):
    """After inserting rows in a Kudu table containing a decimal type column, check that the decimal
    type column is correctly retrieved by the Kudu Lookup processor.

        dev_raw_data_source >> kudu >> trash
    """
    if not hasattr(cluster, 'kudu'):
        pytest.skip('Kudu tests only run against clusters with the Kudu service present.')

    if not Version(cluster.kudu.version) >= Version('1.7.0'):
        pytest.skip(f'Test only designed to run on Kudu version >= 1.7.0, but found {cluster.kudu.version}')

    tour_de_france_contenders = [dict(rank=1, weight=150.58),
                                 dict(rank=2, weight=140.11)]

    raw_data = ''.join(json.dumps(contender) for contender in tour_de_france_contenders)
    key_columns_mapping = [dict(field='/rank', columnName='rank')]
    column_to_output_field_mapping = [dict(columnName='rank', field='/rank'),
                                      dict(columnName='weight', field='/weight', defaultValue='0')]

    kudu_table_name = get_random_string(string.ascii_letters, 10)
    kudu_master_address = '{}:{}'.format(cluster.server_host, DEFAULT_KUDU_PORT)

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data)
    kudu = builder.add_stage('Kudu Lookup',
                             type='processor').set_attributes(kudu_masters=kudu_master_address,
                                                              kudu_table_name='{}.{}'.format('impala::default',
                                                                                             kudu_table_name),
                                                              key_columns_mapping=key_columns_mapping,
                                                              column_to_output_field_mapping=column_to_output_field_mapping,
                                                              case_sensitive=True,
                                                              ignore_missing_value=True)

    trash = builder.add_stage('Trash')
    dev_raw_data_source >> kudu >> trash

    pipeline = builder.build().configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)

    metadata = sqlalchemy.MetaData()
    tdf_contenders_table = sqlalchemy.Table(kudu_table_name,
                                            metadata,
                                            sqlalchemy.Column('rank', sqlalchemy.Integer, primary_key=True),
                                            sqlalchemy.Column('weight', sqlalchemy.DECIMAL(5, 2)),
                                            impala_partition_by='HASH PARTITIONS 16',
                                            impala_stored_as='KUDU',
                                            impala_table_properties={
                                                'kudu.master_addresses': kudu_master_address,
                                                'kudu.num_tablet_replicas': '1'
                                            })

    # Resolved before the try block so that the cleanup code can always refer to it.
    engine = cluster.kudu.engine
    try:
        logger.info('Creating Kudu table %s ...', kudu_table_name)
        tdf_contenders_table.create(engine)

        # The connection is closed as soon as the rows are inserted.
        with engine.connect() as conn:
            conn.execute(tdf_contenders_table.insert(), tour_de_france_contenders)

        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)

        # Output records are compared positionally with the input rows.
        for position, result in enumerate(snapshot[kudu.instance_name].output):
            expected_weight = round(Decimal(tour_de_france_contenders[position]['weight']), 2)
            assert result.field['weight'].value == expected_weight
    finally:
        logger.info('Dropping Kudu table %s ...', kudu_table_name)
        tdf_contenders_table.drop(engine)
def test_kudu_destination_decimal_type(sdc_builder, sdc_executor, cluster):
    """Simple Dev Raw Data Source to Kudu pipeline inserting a column of decimal type and checking
    later on that the decimal type is correctly stored by querying the Kudu database.

        dev_raw_data_source >> kudu
    """
    if not hasattr(cluster, 'kudu'):
        pytest.skip('Kudu tests only run against clusters with the Kudu service present.')

    if not Version(cluster.kudu.version) >= Version('1.7.0'):
        pytest.skip(f'Test only designed to run on Kudu version >= 1.7.0, but found {cluster.kudu.version}')

    # Generate some data.
    tour_de_france_contenders = [dict(favorite_rank=1, name='Chris Froome', wins=3, weight=153.22),
                                 dict(favorite_rank=2, name='Greg LeMond', wins=3, weight=158.73),
                                 dict(favorite_rank=4, name='Vincenzo Nibali', wins=1, weight=144),
                                 dict(favorite_rank=3, name='Nairo Quintana', wins=0, weight=165.34)]
    raw_data = '\n'.join(json.dumps(contender) for contender in tour_de_france_contenders)

    field_to_column_mapping = [dict(field='/favorite_rank', columnName='rank'),
                               dict(field='/name', columnName='name'),
                               dict(field='/wins', columnName='wins'),
                               dict(field='/weight', columnName='weight')]

    kudu_table_name = get_random_string(string.ascii_letters, 10)
    kudu_master_address = '{}:{}'.format(cluster.server_host, DEFAULT_KUDU_PORT)

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data)
    kudu = builder.add_stage('Kudu',
                             type='destination').set_attributes(table_name='{}.{}'.format('impala::default',
                                                                                          kudu_table_name),
                                                                default_operation='INSERT',
                                                                field_to_column_mapping=field_to_column_mapping)
    dev_raw_data_source >> kudu

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.delivery_guarantee = 'AT_MOST_ONCE'
    # We want to write data once and then stop, but Dev Raw Data Source will keep looping, so we set the rate limit to
    # a low value and will rely upon pipeline metrics to know when to stop the pipeline.
    pipeline.rate_limit = 4

    metadata = sqlalchemy.MetaData()
    tdf_contenders_table = sqlalchemy.Table(kudu_table_name,
                                            metadata,
                                            sqlalchemy.Column('rank', sqlalchemy.Integer, primary_key=True),
                                            sqlalchemy.Column('name', sqlalchemy.String),
                                            sqlalchemy.Column('wins', sqlalchemy.Integer),
                                            sqlalchemy.Column('weight', sqlalchemy.DECIMAL(5, 2)),
                                            impala_partition_by='HASH PARTITIONS 16',
                                            impala_stored_as='KUDU',
                                            impala_table_properties={
                                                'kudu.master_addresses': kudu_master_address,
                                                'kudu.num_tablet_replicas': '1'
                                            })

    # Resolved before the try block so that the cleanup code can always refer to it.
    engine = cluster.kudu.engine
    try:
        logger.info('Creating Kudu table %s ...', kudu_table_name)
        tdf_contenders_table.create(engine)

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(len(tour_de_france_contenders))
        sdc_executor.stop_pipeline(pipeline)

        # Read everything while the connection is open; it is closed when the block is left.
        with engine.connect() as connection:
            result = connection.execute(sqlalchemy.sql.select([tdf_contenders_table]).order_by('rank'))
            result_list = list(result)

        # Expected rows, ordered by rank like the query, with the weight at the column's scale of 2.
        sorted_tour_de_france_contenders = [
            (item['favorite_rank'], item['name'], item['wins'], round(Decimal(item['weight']), 2))
            for item in sorted(tour_de_france_contenders, key=lambda contender: contender['favorite_rank'])
        ]

        assert result_list == sorted_tour_de_france_contenders
    finally:
        logger.info('Dropping Kudu table %s ...', kudu_table_name)
        tdf_contenders_table.drop(engine)
def test_kudu_destination_unixtime_micro_datatype(sdc_builder, sdc_executor, cluster):
    """Test Kudu's UNIXTIME_MICRO data type support.

    A single record holding the current time in milliseconds is written to a TIMESTAMP column and
    the stored row is compared with the original datetime.

        dev_raw_data_source >> kudu
    """
    if not hasattr(cluster, 'kudu'):
        pytest.skip('Kudu tests only run against clusters with the Kudu service present.')

    if Version(cluster.version) < Version('cdh5.12.0'):
        pytest.skip('Test requires CDH 5.12.0+ to run')

    # Generate some data. Kudu does not store microsecond so set it 0.
    now = datetime.now().replace(microsecond=0)
    now_millisecond = time.mktime(now.timetuple()) * 1000
    input_data = [dict(id=1, time=now_millisecond)]
    raw_data = ''.join(json.dumps(contender) for contender in input_data)

    field_to_column_mapping = [dict(field='/id', columnName='id'),
                               dict(field='/time', columnName='unixtime_micro')]

    kudu_table_name = get_random_string(string.ascii_letters, 10)
    kudu_master_address = f'{cluster.server_host}:{DEFAULT_KUDU_PORT}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=raw_data)
    kudu = builder.add_stage('Kudu',
                             type='destination').set_attributes(table_name='{}.{}'.format('impala::default',
                                                                                          kudu_table_name),
                                                                default_operation='INSERT',
                                                                field_to_column_mapping=field_to_column_mapping)
    dev_raw_data_source >> kudu

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.delivery_guarantee = 'AT_MOST_ONCE'
    # We want to write data once and then stop, but Dev Raw Data Source will keep looping, so we set the rate limit to
    # a low value and will rely upon pipeline metrics to know when to stop the pipeline.
    pipeline.rate_limit = 4

    metadata = sqlalchemy.MetaData()
    test_table = sqlalchemy.Table(kudu_table_name,
                                  metadata,
                                  sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
                                  sqlalchemy.Column('unixtime_micro', sqlalchemy.TIMESTAMP),
                                  impala_partition_by='HASH PARTITIONS 16',
                                  impala_stored_as='KUDU',
                                  impala_table_properties={
                                      'kudu.master_addresses': kudu_master_address,
                                      'kudu.num_tablet_replicas': '1'
                                  })

    # Resolved before the try block so that the cleanup code can always refer to it.
    engine = cluster.kudu.engine
    try:
        logger.info('Creating Kudu table %s ...', kudu_table_name)
        test_table.create(engine)

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(len(input_data))
        sdc_executor.stop_pipeline(pipeline)

        # Fetch the row while the connection is open; it is closed when the block is left.
        with engine.connect() as connection:
            result = connection.execute(sqlalchemy.sql.select([test_table])).fetchone()
        assert list(result) == [1, now]
    finally:
        logger.info('Dropping Kudu table %s ...', kudu_table_name)
        test_table.drop(engine)
def test_wait_for_completion_processor(sdc_builder, sdc_executor):
    """Test the Wait For Pipeline Completion processor.

    Two batch pipelines are started in the background by two Start Pipeline processors and a single
    wait stage receives the output of both:

        dev_raw_data_source1 >> [start_pipeline1, start_pipeline2]
        start_pipeline1 >> wait_for_pipeline_completion
        start_pipeline2 >> wait_for_pipeline_completion
        wait_for_pipeline_completion >> trash
    """
    is_3_17_or_later = Version(sdc_builder.version) >= Version('3.17.0')
    # From 3.17.0 on, the stage labels and the task name attribute differ.
    if is_3_17_or_later:
        start_pipeline_stage_label = 'Start Pipelines'
        wait_for_completion_stage_label = 'Wait for Pipelines'
        task_name_attribute = 'task_name'
    else:
        start_pipeline_stage_label = 'Start Pipeline'
        wait_for_completion_stage_label = 'Wait for Pipeline Completion'
        task_name_attribute = 'unique_task_name'

    # Pipeline - pipeline1
    pipeline1 = _create_batch_pipeline(sdc_builder, 'test_wait_for_completion_processor')
    sdc_executor.add_pipeline(pipeline1)

    # Pipeline - pipeline2
    pipeline2 = _create_batch_pipeline(sdc_builder, 'test_wait_for_completion_processor2')
    sdc_executor.add_pipeline(pipeline2)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source1 = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source1.stop_after_first_batch = True

    def add_background_start_stage(task_name, target_pipeline):
        # Add a Start Pipeline processor that launches target_pipeline in the background.
        stage = pipeline_builder.add_stage(start_pipeline_stage_label, type='processor')
        setattr(stage, task_name_attribute, task_name)
        stage.run_in_background = True
        stage.pipelines = [{'pipelineIdType': 'ID', 'pipelineId': target_pipeline.id}]
        return stage

    start_pipeline1 = add_background_start_stage('task1', pipeline1)
    start_pipeline2 = add_background_start_stage('task2', pipeline2)

    wait_for_pipeline_completion = pipeline_builder.add_stage(wait_for_completion_stage_label)
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source1 >> [start_pipeline1, start_pipeline2]
    start_pipeline1 >> wait_for_pipeline_completion
    start_pipeline2 >> wait_for_pipeline_completion
    wait_for_pipeline_completion >> trash

    pipeline = pipeline_builder.build('Chain Pipeline Execution Sample2')
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.validate_pipeline(pipeline)
    snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot

    # Assert start_pipeline1 record output
    first_start_output = snapshot[start_pipeline1.instance_name].output
    assert len(first_start_output) == 1
    _validate_start_pipeline_output(first_start_output[0].field['orchestratorTasks'], 'task1', pipeline1, False)

    # Assert start_pipeline2 record output
    second_start_output = snapshot[start_pipeline2.instance_name].output
    assert len(second_start_output) == 1
    _validate_start_pipeline_output(second_start_output[0].field['orchestratorTasks'], 'task2', pipeline2, False)

    # Assert wait_for_pipeline_completion record output
    wait_output = snapshot[wait_for_pipeline_completion.instance_name].output
    assert len(wait_output) == 1
    for task_name, started_pipeline in (('task1', pipeline1), ('task2', pipeline2)):
        _validate_start_pipeline_output(wait_output[0].field['orchestratorTasks'], task_name,
                                        started_pipeline, True)
def test_data_types(sdc_builder, sdc_executor, mongodb, input, converter_type, improve_types, expected):
    """Check the value the MongoDB destination stores for a field converted to ``converter_type``.

    Pipeline:
        dev_raw_data_source >> field_type_converter >> expression_evaluator >> mongodb_dest

    The origin emits one JSON record ``{"value": input}``, the converter casts
    ``/value`` to ``converter_type`` and the resulting record is written to a
    randomly named MongoDB database/collection, which is dropped afterwards.

    Args:
        sdc_builder: Data Collector builder fixture.
        sdc_executor: Data Collector executor fixture.
        mongodb: MongoDB environment fixture.
        input: Raw value placed in the ``value`` field of the source record
            (the name shadows the builtin but is part of the test's signature).
        converter_type: Target type given to the Field Type Converter.
        improve_types: Value for the destination's ``improve_type_conversion``
            attribute; the test is skipped when it is truthy on SDC <= 4.0.2.
        expected: Value the stored document's ``value`` field is compared with.
    """
    if Version(sdc_builder.version) <= Version('4.0.2') and improve_types:
        pytest.skip('Improved Type Conversion is not present on that SDC version')

    database = get_random_string(string.ascii_letters, 5)
    collection = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Discard')

    origin = pipeline_builder.add_stage('Dev Raw Data Source')
    origin.set_attributes(data_format='JSON',
                          stop_after_first_batch=True,
                          raw_data=json.dumps({"value": input}))

    converter = pipeline_builder.add_stage('Field Type Converter')
    converter.set_attributes(conversion_method='BY_FIELD',
                             field_type_converter_configs=[{
                                 'fields': ['/value'],
                                 'targetType': converter_type,
                                 'dataLocale': 'en,US',
                                 'dateFormat': 'YYYY_MM_DD_HH_MM_SS',
                                 'zonedDateTimeFormat': 'ISO_OFFSET_DATE_TIME',
                                 'scale': 2
                             }])

    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    # MongoDB destination uses the CRUD operation in the sdc.operation.type record header attribute when writing
    # to MongoDB. The value 1 set below is INSERT in StreamSets' CRUD operation codes (4 would be UPSERT).
    expression_evaluator.header_attribute_expressions = [{
        'attributeToSet': 'sdc.operation.type',
        'headerAttributeExpression': '1'
    }]

    mongodb_dest = pipeline_builder.add_stage('MongoDB', type='destination')
    # The improve_type_conversion attribute is only set on versions newer than 4.0.2.
    if Version(sdc_builder.version) > Version('4.0.2'):
        mongodb_dest.set_attributes(improve_type_conversion=improve_types)
    mongodb_dest.set_attributes(database=database, collection=collection)

    origin >> converter >> expression_evaluator >> mongodb_dest

    pipeline = pipeline_builder.build().configure_for_environment(mongodb)
    pipeline.configuration["shouldRetry"] = False
    sdc_executor.add_pipeline(pipeline)

    try:
        # Run the pipeline to completion, then read back from MongoDB to assert.
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        mongodb_documents = list(mongodb.engine[mongodb_dest.database][mongodb_dest.collection].find())
        assert len(mongodb_documents) == 1

        doc = mongodb_documents[0]
        # With improved conversion, FLOAT and DECIMAL values are compared approximately
        # rather than exactly.
        if converter_type == 'FLOAT' and improve_types:
            assert pytest.approx(doc['value']) == expected
        elif converter_type == 'DECIMAL' and improve_types:
            # Go through str() so the stored value is turned into a Decimal from its textual form.
            assert pytest.approx(Decimal(str(doc['value']))) == expected
        else:
            assert doc['value'] == expected
    finally:
        logger.info('Dropping %s database...', mongodb_dest.database)
        mongodb.engine.drop_database(mongodb_dest.database)
def test_kafka_origin_save_offset(sdc_builder, sdc_executor, cluster):
    """Check that the offset is saved when a batch is smaller than the max batch size.

    SDC-10501 introduced a bug which does not commit the offset when the number of
    records is less than the max batch size. This test processes 5 records in a
    first run, stops the pipeline, produces 3 more messages and runs the pipeline
    again. The second run should process only those 3 records, as the offset
    should have been saved after the first run.

    Kafka Multitopic Origin >> Trash (Run twice)
    """
    topic = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()

    consumer = get_kafka_multitopic_consumer_stage(builder, cluster)
    consumer.topic_list = [topic]
    consumer.consumer_group = get_random_string(string.ascii_letters, 10)
    consumer.batch_wait_time_in_ms = 100
    # From SDC 3.7.0 on the stage has a dedicated auto_offset_reset attribute; older
    # versions get the equivalent Kafka property through configuration_properties.
    if Version(sdc_builder.version) >= Version('3.7.0'):
        consumer.auto_offset_reset = 'EARLIEST'
    else:
        consumer.configuration_properties = [{'key': 'auto.offset.reset', 'value': 'earliest'}]

    trash = builder.add_stage(label='Trash')
    consumer >> trash

    pipeline = builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    pipeline.configuration['executionMode'] = 'STANDALONE'
    sdc_executor.add_pipeline(pipeline)

    # First run: 5 messages.
    first_messages = [f'message{i}' for i in range(5)]
    produce_kafka_messages_list(consumer.topic_list[0], cluster, first_messages, 'TEXT')

    try:
        # Start the pipeline, capture a single batch and stop it again.
        first_snapshot = sdc_executor.capture_snapshot(pipeline, batches=1, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)

        # All 5 messages must have been processed by the first run.
        first_records = first_snapshot[consumer].output
        assert len(first_records) == 5

        # Second run: 3 additional messages.
        second_messages = [f'message{i}' for i in range(5, 8)]
        produce_kafka_messages_list(consumer.topic_list[0], cluster, second_messages, 'TEXT')

        second_snapshot = sdc_executor.capture_snapshot(pipeline, batches=1, start_pipeline=True).snapshot

        # Only the 3 new messages should be processed by the second run.
        second_records = second_snapshot[consumer].output
        assert len(second_records) == 3
    finally:
        sdc_executor.stop_pipeline(pipeline)