def test_datalake_destination_max_records(sdc_builder, sdc_executor, azure, adls_version):
    """Verify the Data Lake Store destination with max records per file set to 1.

    Ten JSON records are sent through the pipeline (the origin stops after its first batch).
    With ``max_records_in_file=1`` the test expects one output file per record, each named
    with the configured prefix and suffix, and the combined file contents to match the input.

    The pipeline looks like:

    Data Lake Store Destination pipeline:
        dev_raw_data_source >> azure_data_lake_store_destination
    """
    directory_name = get_random_string(string.ascii_letters, 10)
    files_prefix = get_random_string(string.ascii_letters, 10)
    files_suffix = 'json'

    # Records get consecutive ids starting at 1, paired with these names in order.
    names = ['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr', 'stu', 'vwx', 'y', 'z']
    raw_data = [dict(id=position, name=name) for position, name in enumerate(names, start=1)]
    dev_raw_data_source_data = ''.join(map(json.dumps, raw_data))

    # The legacy stage takes the bare directory name; the other versions take an absolute path.
    if adls_version == ADLS_LEGACY:
        directory_template = directory_name
    else:
        directory_template = f'/{directory_name}'

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       raw_data=dev_raw_data_source_data,
                                       stop_after_first_batch=True)

    target_stagelib = ADLS_GEN_STAGELIBS[adls_version].target_stagelib
    azure_data_lake_store = pipeline_builder.add_stage(name=target_stagelib)
    azure_data_lake_store.set_attributes(data_format='JSON',
                                         directory_template=directory_template,
                                         files_prefix=files_prefix,
                                         files_suffix=files_suffix,
                                         max_records_in_file=1)

    dev_raw_data_source >> azure_data_lake_store

    pipeline = pipeline_builder.build().configure_for_environment(azure)
    sdc_executor.add_pipeline(pipeline)

    dl_fs = azure.datalake.file_system
    try:
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        written_files = dl_fs.ls(directory_name)
        # One record per file means as many files as input records.
        assert len(written_files) == len(raw_data)

        for path in written_files:
            base_name = path.split('/')[-1]
            assert base_name.startswith(files_prefix) and base_name.endswith(files_suffix)

        contents = [json.loads(dl_fs.cat(path).decode()) for path in written_files]
        # Listing order is not relied upon, so both sides are ordered by id before comparing.
        by_id = itemgetter('id')
        assert sorted(contents, key=by_id) == sorted(raw_data, key=by_id)
    finally:
        leftovers = dl_fs.ls(directory_name)
        logger.info('Azure Data Lake directory %s and underlying files will be deleted.', directory_name)
        # Note: Non-empty directory is not allowed to be removed, hence remove all files first.
        for path in leftovers:
            dl_fs.rm(path)
        dl_fs.rmdir(directory_name)
def test_data():
    """Yield a list of four sample records with random lower-case field values.

    Every record has the keys ``text``, ``index``, ``mapping``, ``doc_id`` and ``shard``.
    The first three records get a random ``shard``; the last one has ``shard`` set to ``None``.
    """

    def _random_lower():
        # Ten random ASCII letters, lower-cased.
        return get_random_string(string.ascii_letters, 10).lower()

    texts = ("Record1", "Record2", "Record3", "Record4")
    last_position = len(texts) - 1

    yield [
        {
            "text": text,
            "index": _random_lower(),
            "mapping": _random_lower(),
            "doc_id": _random_lower(),
            # Only the final record is left without a shard.
            "shard": None if position == last_position else _random_lower(),
        }
        for position, text in enumerate(texts)
    ]
def test_sql_server_cdc_with_cdc_schema_name(sdc_builder, sdc_executor, database, no_of_threads):
    """Test for SQL Server CDC origin stage when schema change is enabled.

    We do so by capturing Insert Operation on CDC enabled table(s) using SQL Server CDC Origin
    and having a pipeline which reads that data using SQL Server CDC origin stage.
    The records in the pipeline will be stored in SQL Server table using JDBC Producer.
    While the pipeline is running, a column is added to the last source table and to the
    destination table, and a second capture instance is enabled for the source table.
    Data is then asserted for what is captured at SQL Server Job and what we read in the pipeline.

    The pipeline looks like:
        sql_server_cdc_origin >> jdbc_producer
    """
    schema_name = DEFAULT_SCHEMA_NAME

    pipeline_builder = sdc_builder.get_pipeline_builder()
    sql_server_cdc = pipeline_builder.add_stage('SQL Server CDC Client')
    sql_server_cdc.set_attributes(
        allow_late_tables=True,
        enable_schema_changes_event=True,
        # when allow_late_tables = true, the pipeline runs one background thread
        # to spool the list of cdc tables
        maximum_pool_size=no_of_threads + 1,
        minimum_idle_connections=no_of_threads + 1,
        new_table_discovery_interval='${1 * SECONDS}',
        number_of_threads=no_of_threads)

    dest_table_name = get_random_string(string.ascii_uppercase, 9)
    dest_table = create_table(database, DEFAULT_SCHEMA_NAME, dest_table_name)

    jdbc_producer = pipeline_builder.add_stage('JDBC Producer')
    jdbc_producer.set_attributes(default_operation='INSERT',
                                 field_to_column_mapping=[],
                                 schema_name=DEFAULT_SCHEMA_NAME,
                                 table_name=dest_table_name)

    sql_server_cdc >> jdbc_producer
    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    # Defined before the try block so the cleanup in ``finally`` can always rely on them,
    # even when the test fails before any source table or connection has been created.
    tables = []
    connection = None
    try:
        no_of_records = 5
        rows_in_database = setup_sample_data(no_of_threads * no_of_records)

        for index in range(0, no_of_threads):
            table_name = get_random_string(string.ascii_lowercase, 20)
            # split the rows_in_database into no_of_records for each table
            # e.g. for no_of_records=5, the first table inserts rows_in_database[0:5]
            # and the second table inserts rows_in_database[5:10]
            table = setup_table(database,
                                DEFAULT_SCHEMA_NAME,
                                table_name,
                                rows_in_database[(index * no_of_records):((index + 1) * no_of_records)])
            tables.append(table)

            # wait for data captured by cdc jobs in sql server before starting the pipeline
            ct_table_name = f'{DEFAULT_SCHEMA_NAME}_{table_name}_CT'
            wait_for_data_in_table(ct_table_name, no_of_records, 'cdc', database)

        sdc_executor.start_pipeline(pipeline)
        wait_for_data_in_table(dest_table_name, no_of_records * no_of_threads, DEFAULT_SCHEMA_NAME, database)
        assert_table_replicated(database, rows_in_database, DEFAULT_SCHEMA_NAME, dest_table_name)

        # add the new column to the last input table (table_name still holds the last generated name)
        connection = database.engine.connect()
        logger.info('Adding the column new_column varchar(10) on %s.%s...', schema_name, table_name)
        connection.execute(f'ALTER TABLE {table_name} ADD new_column VARCHAR(10)')

        logger.info('Adding the column new_column varchar(10) on %s.%s...', schema_name, dest_table_name)
        connection.execute(f'ALTER TABLE {dest_table_name} ADD new_column VARCHAR(10)')

        # Enable a second capture instance (suffix _2) and then disable the first one.
        logger.info('Restarting CDC on table %s', table_name)
        connection.execute(f'EXEC sys.sp_cdc_enable_table '
                           f'@source_schema=N\'{DEFAULT_SCHEMA_NAME}\', '
                           f'@source_name=N\'{table_name}\','
                           f'@role_name = NULL, '
                           f'@capture_instance={schema_name}_{table_name}_2')
        sleep(1)
        logger.info("Enabled _2 CT")

        connection.execute(f'EXEC sys.sp_cdc_disable_table '
                           f'@source_schema=N\'{DEFAULT_SCHEMA_NAME}\', '
                           f'@source_name=N\'{table_name}\','
                           f'@capture_instance={schema_name}_{table_name}')
        sleep(1)
        logger.info("Disabled CT")

        # Table definition matching the altered source table, used only to build the INSERT.
        altered_table = sqlalchemy.Table(table_name,
                                         sqlalchemy.MetaData(),
                                         sqlalchemy.Column('id',
                                                           sqlalchemy.Integer,
                                                           primary_key=True,
                                                           autoincrement=False),
                                         sqlalchemy.Column('name', sqlalchemy.String(25)),
                                         sqlalchemy.Column('dt', sqlalchemy.String(25)),
                                         sqlalchemy.Column('new_column', sqlalchemy.String(10)),
                                         schema=schema_name)

        new_sample_data = [
            {
                'id': counter,
                'name': get_random_string(string.ascii_lowercase, 20),
                'dt': '2017-05-05',
                'new_column': get_random_string(string.ascii_lowercase, 10)
            }
            # start with the last counter of rows_in_data to the number of records
            for counter in range(no_of_threads * no_of_records, (no_of_threads + 1) * no_of_records)
        ]

        logger.info('Adding %s rows into %s.%s...', len(new_sample_data), schema_name, table_name)
        connection.execute(altered_table.insert(), new_sample_data)

        # adjust sample data by adding new_columns: None and add new sample data to the list
        rows_in_database.extend(new_sample_data)
        # WARNING! the schema change is not captured by JDBC Producer
        for data in rows_in_database:
            data.update(new_column=None)

        ct2_table_name = f'{DEFAULT_SCHEMA_NAME}_{table_name}_2_CT'
        # wait for the completion of the next batch
        wait_for_data_in_table(ct2_table_name, no_of_records, 'cdc', database)

        sdc_executor.stop_pipeline(pipeline)
        assert_table_replicated(database, rows_in_database, DEFAULT_SCHEMA_NAME, dest_table_name)
    finally:
        try:
            # A failure between start and stop above would otherwise leave the pipeline running.
            if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
                sdc_executor.stop_pipeline(pipeline)
        finally:
            # Table cleanup runs even if stopping the pipeline fails.
            if connection is not None:
                connection.close()
            tables.append(dest_table)
            for table in tables:
                logger.info('Dropping table %s in %s database...', table, database.type)
                table.drop(database.engine)
def test_hbase_lookup_processor_get_row(sdc_builder, sdc_executor, cluster):
    """HBase Lookup processor test.

    The pipeline is properly configured and is expected to return the rows stored in HBase.

    The pipeline looks like:
        dev_raw_data_source >> hbase_lookup >> trash
    """
    # Generate some silly data.
    bike_races = [
        dict(name='Tour de France', first_edition='1903'),
        dict(name='Giro d Italia', first_edition='1909'),
        dict(name='Vuelta a Espana', first_edition='1935'),
    ]

    # Expected table scan result: (row key, columns) pairs as bytes, ordered by row key.
    expected = sorted(
        ((race['name'].encode(), {b'info:first_edition': race['first_edition'].encode()})
         for race in bike_races),
        key=lambda row: row[0])

    # Convert to raw data for the Dev Raw Data Source: one race name per line.
    race_names = [race['name'] for race in bike_races]
    raw_data = '\n'.join(race_names)

    # Generate HBase Lookup's attributes.
    lookup_parameters = [
        dict(rowExpr="${record:value('/text')}",
             columnExpr='info:first_edition',
             outputFieldPath='/founded',
             timestampExpr='')
    ]

    # Get random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Source stage.
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_data)

    # Create HBase Lookup processor.
    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(lookup_parameters=lookup_parameters, table_name=table_name)

    # Create trash destination.
    trash = pipeline_builder.add_stage('Trash')

    # Build pipeline.
    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    hbase_client = cluster.hbase.client
    try:
        logger.info('Creating HBase table %s ...', table_name)
        hbase_client.create_table(name=table_name, families={'info': {}})

        # Use HappyBase's `Batch` instance to avoid unnecessary calls to HBase.
        batch = hbase_client.table(table_name).batch()
        for race in bike_races:
            # Use of str.encode() below is because HBase (and HappyBase) speaks in byte arrays.
            row_key = race['name'].encode()
            columns = {b'info:first_edition': race['first_edition'].encode()}
            batch.put(row_key, columns)
        batch.send()

        # Take a pipeline snapshot.
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)

        # Validate the processor output: each record carries the race name and the looked-up year.
        looked_up = []
        for record in snapshot[hbase_lookup.instance_name].output:
            fields = record.value2
            looked_up.append(dict(name=fields['text'], first_edition=fields['founded']))
        assert looked_up == bike_races

        # Validate the table contents.
        result_list = list(hbase_client.table(table_name).scan())
        assert result_list == expected
    finally:
        # Delete HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        hbase_client.delete_table(name=table_name, disable=True)
def test_postgres_cdc_client_filtering_table(sdc_builder, sdc_executor, database):
    """Test filtering for inserts/updates/deletes to a Postgres table.

    1. Random table names for "table_allow", "table_deny"
    2. Filter OUT anything for "table_deny"
    3. Insert/update/delete for both tables
    4. Should see updates for "table_allow" only

    The pipeline looks like:
        postgres_cdc_client >> trash
    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name_allow = get_random_string(string.ascii_lowercase, 20)
    table_name_deny = get_random_string(string.ascii_lowercase, 20)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=False,
                                       replication_slot=replication_slot_name,
                                       schema_table_configs=[{
                                           'schema': 'public'
                                       }, {
                                           'exclude_pattern': table_name_deny
                                       }, {
                                           'table': table_name_allow
                                       }])
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    # Initialised before the try block: the cleanup in ``finally`` tests these against None, and
    # would raise NameError (hiding the real failure) if the test failed before they were assigned.
    table_allow = None
    table_deny = None
    connection = None
    try:
        # Database operations done after pipeline start will be captured by CDC.
        # Hence start the pipeline but do not wait for the capture to be finished.
        snapshot_command = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, wait=False)

        # Create table and then perform insert, update and delete operations.
        table_allow = _create_table_in_database(table_name_allow, database)
        table_deny = _create_table_in_database(table_name_deny, database)
        connection = database.engine.connect()

        expected_operations_data = _insert(connection=connection, table=table_allow)
        expected_operations_data += _update(connection=connection, table=table_allow)
        expected_operations_data += _delete(connection=connection, table=table_allow)

        # Run the same operations against the denied table. Only "table_allow" changes are
        # expected in the output, so the results are intentionally not added to the expectations.
        _insert(connection=connection, table=table_deny)
        _update(connection=connection, table=table_deny)
        _delete(connection=connection, table=table_deny)

        snapshot = snapshot_command.wait_for_finished().snapshot

        # Verify snapshot data is received in exact order as expected.
        operation_index = 0
        for record in snapshot[postgres_cdc_client.instance_name].output:
            # No need to worry about DDL related CDC records. e.g. table creation etc.
            if record.get_field_data('/change'):
                # Since we performed each operation (insert, update and delete) on 3 rows,
                # each CDC record change contains a list of 3 elements.
                for i in range(3):
                    if operation_index >= len(expected_operations_data):
                        break
                    expected = expected_operations_data[operation_index]
                    assert expected.kind == record.get_field_data(f'/change[{i}]/kind')
                    assert expected.table == record.get_field_data(f'/change[{i}]/table')
                    # For delete operation there are no columnnames and columnvalues fields.
                    if expected.kind != KIND_FOR_DELETE:
                        assert expected.columnnames == record.get_field_data(f'/change[{i}]/columnnames')
                        assert expected.columnvalues == record.get_field_data(f'/change[{i}]/columnvalues')
                    if expected.kind != KIND_FOR_INSERT:
                        # For update and delete operations verify extra information about old keys.
                        assert expected.oldkeys.keynames == record.get_field_data(
                            f'/change[{i}]/oldkeys/keynames')
                        assert expected.oldkeys.keyvalues == record.get_field_data(
                            f'/change[{i}]/oldkeys/keyvalues')
                    operation_index += 1
    finally:
        if pipeline:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        database.deactivate_and_drop_replication_slot(replication_slot_name)
        # Release the connection before dropping the tables it was used on.
        if connection is not None:
            connection.close()
        if table_allow is not None:
            table_allow.drop(database.engine)
            logger.info('Table: %s dropped.', table_name_allow)
        if table_deny is not None:
            table_deny.drop(database.engine)
            logger.info('Table: %s dropped.', table_name_deny)
def test_hbase_lookup_processor_invalid_url(sdc_builder, sdc_executor, cluster):
    """HBase Lookup processor test.

    The ZooKeeper quorum of the lookup stage is cleared before the pipeline is built; the
    pipeline preview is still expected to come back without issues.

    The pipeline looks like:
        dev_raw_data_source >> hbase_lookup >> trash
    """
    # Generate some silly data.
    bike_races = [
        dict(name='Tour de France', first_edition='1903'),
        dict(name="Giro d'Italia", first_edition='1909'),
        dict(name='Vuelta a Espana', first_edition='1935'),
    ]

    # Convert to raw data for the Dev Raw Data Source: one race name per line.
    race_names = [race['name'] for race in bike_races]
    raw_data = '\n'.join(race_names)

    # Generate HBase Lookup's attributes.
    lookup_parameters = [
        dict(rowExpr="${record:value('/text')}",
             columnExpr='info:empty',
             outputFieldPath='/founded',
             timestampExpr='')
    ]

    # Get random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Source stage.
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_data)

    # Create HBase Lookup processor.
    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(lookup_parameters=lookup_parameters, table_name=table_name)
    hbase_lookup.zookeeper_quorum = None

    # Create trash destination.
    trash = pipeline_builder.add_stage('Trash')

    # Build pipeline.
    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    hbase_client = cluster.hbase.client
    try:
        logger.info('Creating HBase table %s ...', table_name)
        hbase_client.create_table(name=table_name, families={'info': {}})

        # Use HappyBase's `Batch` instance to avoid unnecessary calls to HBase.
        batch = hbase_client.table(table_name).batch()
        for race in bike_races:
            # Use of str.encode() below is because HBase (and HappyBase) speaks in byte arrays.
            row_key = race['name'].encode()
            columns = {b'info:first_edition': race['first_edition'].encode()}
            batch.put(row_key, columns)
        batch.send()

        # Run preview.
        preview = sdc_executor.run_pipeline_preview(pipeline).preview
        assert preview is not None

        assert preview.issues.issues_count == 0
    finally:
        # Delete HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        hbase_client.delete_table(name=table_name, disable=True)
def test_hbase_get_empty_key_to_error(sdc_builder, sdc_executor, cluster):
    """Check that a record without a row key is sent to error.

    The input record has no ``/row_key`` field and the lookup stage is configured with
    ``ignore_row_missing_field=False`` and ``on_record_error='TO_ERROR'``.

    The pipeline looks like:
        dev_raw_data_source >> hbase_lookup >> trash
    """
    json_data = json.dumps({'columnField': 'cf1:column'})

    # Generate HBase Lookup's attributes.
    lookup_parameters = [
        dict(rowExpr="${record:value('/row_key')}",
             columnExpr="${record:value('/columnField')}",
             outputFieldPath='/output',
             timestampExpr='')
    ]

    # Get random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Source stage.
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.data_format = 'JSON'
    dev_raw_data_source.raw_data = json_data

    # Create HBase Lookup processor.
    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(lookup_parameters=lookup_parameters,
                                table_name=table_name,
                                on_record_error='TO_ERROR',
                                ignore_row_missing_field=False)

    # Create trash destination.
    trash = pipeline_builder.add_stage('Trash')

    # Build pipeline.
    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    hbase_client = cluster.hbase.client
    try:
        logger.info('Creating HBase table %s ...', table_name)
        hbase_client.create_table(name=table_name, families={'cf1': {}})

        # Take a pipeline snapshot.
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)

        # The table was created empty and must still be empty afterwards.
        rows = list(hbase_client.table(table_name).scan())
        assert len(rows) == 0

        # The single input record must have ended up in the stage's error records.
        stage = snapshot[hbase_lookup.instance_name]
        logger.info('Error records %s ...', stage.error_records)
        assert len(stage.error_records) == 1
    finally:
        # Delete HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        hbase_client.delete_table(name=table_name, disable=True)
def test_rabbitmq_rabbitmq_consumer(sdc_builder, sdc_executor, rabbitmq):
    """Test for RabbitMQ consumer origin stage.

    We do so by publishing data to a test queue using RabbitMQ client and having a pipeline
    which reads that data using RabbitMQ consumer origin stage. Data is then asserted for what
    is published at RabbitMQ client and what we read in the pipeline snapshot.

    The pipeline looks like:

    RabbitMQ Consumer pipeline:
        rabbitmq_consumer >> trash
    """
    # Build consumer pipeline.
    name = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    # We set to use default exchange and hence exchange does not need to be pre-created or given.
    rabbitmq_consumer = builder.add_stage('RabbitMQ Consumer').set_attributes(name=name,
                                                                              data_format='TEXT',
                                                                              durable=True,
                                                                              auto_delete=False,
                                                                              bindings=[])
    trash = builder.add_stage('Trash')
    rabbitmq_consumer >> trash

    consumer_origin_pipeline = builder.build(
        title='RabbitMQ Consumer pipeline').configure_for_environment(rabbitmq)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    # Publish the messages first; the pipeline is started afterwards.
    expected_messages = set()
    connection = rabbitmq.blocking_connection
    channel = connection.channel()
    try:
        # About default exchange routing: https://www.rabbitmq.com/tutorials/amqp-concepts.html
        channel.queue_declare(queue=name, durable=True, exclusive=False, auto_delete=False)
        channel.confirm_delivery()
        for i in range(10):
            expected_message = f'Message {i}'
            # Recorded before publishing: an undelivered message makes the final assert fail.
            expected_messages.add(expected_message)
            try:
                channel.basic_publish(exchange="",
                                      routing_key=name,  # Routing key has to be same as queue name.
                                      body=expected_message,
                                      properties=pika.BasicProperties(content_type='text/plain',
                                                                      delivery_mode=1),
                                      mandatory=True)
            except Exception:
                # Deliberately best-effort, but no longer a bare ``except`` (which would also
                # swallow KeyboardInterrupt/SystemExit); the cause is logged for diagnosis.
                logger.warning('Message %s could not be sent.', expected_message, exc_info=True)
    finally:
        # Always release the client resources, even if declaring the queue or publishing fails.
        channel.close()
        connection.close()

    # Messages are published, read through the pipeline and assert.
    snapshot = sdc_executor.capture_snapshot(consumer_origin_pipeline, start_pipeline=True).snapshot
    sdc_executor.stop_pipeline(consumer_origin_pipeline)

    output_records = [
        record.field['text'].value for record in snapshot[rabbitmq_consumer.instance_name].output
    ]

    # Compared as sets, so message order is not checked.
    assert set(output_records) == expected_messages
def test_jdbc_multitable_consumer_origin_configuration_additional_jdbc_configuration_properties(
        sdc_builder, sdc_executor, database, postgres_target_server):
    """Test the ``disableColumnSanitiser`` and ``targetServerType`` additional JDBC properties.

    disableColumnSanitiser - set to 'false'; the test then expects the column names in the
    result set to match the table's column names once lower-cased.

    targetServerType - the connection should succeed when the server matches the requested
    type. The master/slave distinction is currently done by observing if the server allows
    writes. When 'slave' is requested the pipeline is expected to fail, because the server we
    connect to allows writes, i.e. it is of type master.
    """
    src_table_prefix = get_random_string(string.ascii_lowercase, 6)
    table_name = '{}_{}'.format(src_table_prefix, get_random_string(string.ascii_lowercase, 20))

    # Initialised before the try block so the cleanup in ``finally`` cannot raise NameError
    # (and hide the real failure) when setup fails before they are assigned.
    table = None
    pipeline = None
    try:
        columns = [Column('id', Integer, primary_key=True), Column('NAME', String(32))]
        properties = [{
            'key': 'disableColumnSanitiser',
            'value': 'false'
        }, {
            'key': 'targetServerType',
            'value': postgres_target_server
        }]
        rows_in_database = [{'id': row['id'], 'NAME': row['name']} for row in ROWS_IN_DATABASE]

        table = create_table(database, columns, table_name)
        insert_data_in_table(database, table, rows_in_database)

        # Build the pipeline
        attributes = {
            'table_configs': [{
                "tablePattern": f'%{src_table_prefix}%'
            }],
            'additional_jdbc_configuration_properties': properties
        }
        jdbc_multitable_consumer, pipeline = get_jdbc_multitable_consumer_to_trash_pipeline(
            sdc_builder, database, attributes)

        # Execute pipeline and check result.
        sdc_executor.add_pipeline(pipeline)
        if postgres_target_server == 'slave':
            with pytest.raises(Exception):
                # The pipeline must be passed explicitly. Calling start_pipeline() with no
                # argument fails on the missing argument, which satisfied pytest.raises without
                # the pipeline ever being started.
                sdc_executor.start_pipeline(pipeline).wait_for_status('FINISHED')
        else:
            snapshot = sdc_executor.capture_snapshot(pipeline=pipeline, start_pipeline=True).snapshot

            # Take the second field of each record (the one after 'id') and lower-case its
            # name, since the table column is declared in upper case.
            rows_from_snapshot = []
            for record in snapshot[pipeline[0].instance_name].output:
                field_name, field_value = list(record.field.items())[1]
                rows_from_snapshot.append((field_name.lower(), field_value))

            assert rows_from_snapshot == [('name', row['NAME']) for row in rows_in_database]
    finally:
        if (pipeline is not None
                and sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING'):
            sdc_executor.stop_pipeline(pipeline)
        if table is not None:
            delete_table([table], database)
def test_rabbitmq_producer_msg_expiration(sdc_builder, sdc_executor, rabbitmq, set_expiration):
    """Test expiration time in the messages sent by RabbitMQ Producer.

    In SDC 3.10.0 the "Set Expiration" option is introduced, which allows SDC users to
    enable/disable the Expiration Time in the AMQP Message Properties. Prior to that version,
    users were forced to set an Expiration Time when AMQP Message Properties were activated.
    This test checks that messages will be expired within the configured milliseconds only when
    the "Set Expiration" is enabled.

    Pipeline:
        dev_raw_data_source >> rabbitmq_producer
    """
    queue_name = get_random_string(string.ascii_letters, 10)
    exchange_name = get_random_string(string.ascii_letters, 10)
    input_str = 'Hello World!'
    expiration_ms = 2000

    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=input_str)

    rabbitmq_producer = builder.add_stage('RabbitMQ Producer')
    rabbitmq_producer.set_attributes(name=queue_name,
                                     data_format='TEXT',
                                     set_amqp_message_properties=True,
                                     set_expiration=set_expiration,
                                     expiration=expiration_ms,
                                     bindings=[
                                         dict(name=exchange_name,
                                              type='DIRECT',
                                              durable=False,
                                              autoDelete=True)
                                     ])

    dev_raw_data_source >> rabbitmq_producer
    pipeline = builder.build().configure_for_environment(rabbitmq)
    sdc_executor.add_pipeline(pipeline)

    # Set up RabbitMQ client to consume messages sent by SDC
    connection = rabbitmq.blocking_connection
    channel = connection.channel()

    try:
        # Send a message and consume it within `expiration_ms` milliseconds.
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)
        sdc_executor.stop_pipeline(pipeline)
        # basic_get returns a sequence; index 2 is the message body.
        msg_read = channel.basic_get(queue_name, False)[2].decode().replace('\n', '')
        assert msg_read == input_str

        # Send a message, wait `expiration_ms` milliseconds, and consume RabbitMQ queue. If the "Set Expiration"
        # option is enabled, the queue will be empty and no message will be consumed.
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)
        sdc_executor.stop_pipeline(pipeline)
        time.sleep(expiration_ms * 0.001)
        msg_read = channel.basic_get(queue_name, False)[2]

        if set_expiration:
            # Identity check: ``None`` is a singleton and should not be compared with ``==``.
            assert msg_read is None
        else:
            assert msg_read.decode().replace('\n', '') == input_str
    finally:
        channel.queue_delete(queue_name)
        channel.close()
        connection.close()
def test_rabbitmq_rabbitmq_consumer_wrong_format(sdc_builder, sdc_executor, rabbitmq):
    """Test for RabbitMQ consumer origin stage with one malformed message.

    We do so by publishing data to a test queue using RabbitMQ client and having a pipeline
    which reads that data using RabbitMQ consumer origin stage. Data is then asserted for what
    is published at RabbitMQ client and what we read in the pipeline snapshot.

    Ten messages are published. The second one is malformed JSON and should be sent to error;
    the remaining ones should be read. The batch size is set to 1, which used to make the
    connector fail (SDC-14644).

    The pipeline looks like:

    RabbitMQ Consumer pipeline:
        rabbitmq_consumer >> trash
    """
    # Build consumer pipeline.
    name = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    # The origin is bound to a DIRECT exchange named after the queue; the messages themselves
    # are published below through the default exchange.
    rabbitmq_consumer = builder.add_stage('RabbitMQ Consumer')
    rabbitmq_consumer.set_attributes(name=name,
                                     data_format='JSON',
                                     durable=True,
                                     auto_delete=False,
                                     on_record_error='TO_ERROR',
                                     max_batch_size_in_records=1,
                                     bindings=[
                                         dict(name=name,
                                              type='DIRECT',
                                              durable=True,
                                              autoDelete=False)
                                     ])
    trash = builder.add_stage('Trash')
    rabbitmq_consumer >> trash

    consumer_origin_pipeline = builder.build(
        title='RabbitMQ Consumer pipeline').configure_for_environment(rabbitmq)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    # Create input message and expected message.
    expected_messages = [{'msg': f'Message {i}'} for i in range(10) if i != 2]
    input_messages = [json.dumps(msg) for msg in expected_messages]
    input_messages.insert(1, '{"msg":')  # Badly formatted JSON: no closing brace, no value.

    connection = rabbitmq.blocking_connection
    channel = connection.channel()
    try:
        # About default exchange routing: https://www.rabbitmq.com/tutorials/amqp-concepts.html
        channel.queue_declare(queue=name, durable=True, exclusive=False, auto_delete=False)
        channel.confirm_delivery()
        for msg in input_messages:
            try:
                channel.basic_publish(exchange="",
                                      routing_key=name,  # Routing key has to be same as queue name.
                                      body=msg,
                                      properties=pika.BasicProperties(content_type='text/plain',
                                                                      delivery_mode=1),
                                      mandatory=True)
            except Exception:
                # Deliberately best-effort, but no longer a bare ``except`` (which would also
                # swallow KeyboardInterrupt/SystemExit); the cause is logged for diagnosis.
                logger.warning('Message %s could not be sent.', msg, exc_info=True)
    finally:
        # Always release the client resources, even if declaring the queue or publishing fails.
        channel.close()
        connection.close()

    # Messages are published, read through the pipeline and assert.
    snapshot = sdc_executor.capture_snapshot(consumer_origin_pipeline,
                                             start_pipeline=True,
                                             batches=10,
                                             batch_size=1).wait_for_finished().snapshot

    # The malformed second message is expected to produce a stage error; the first entry of
    # the returned stage errors is checked.
    error_code = sdc_executor.get_stage_errors(consumer_origin_pipeline, rabbitmq_consumer)[0].error_code
    assert error_code == 'RABBITMQ_04'

    sdc_executor.stop_pipeline(consumer_origin_pipeline)

    output_records = [
        record.field
        for batch in snapshot.snapshot_batches
        for record in batch.stage_outputs[rabbitmq_consumer.instance_name].output
    ]

    # Datacollector does not guarantee the order of the messages, so we sort them.
    assert sorted(output_records, key=lambda rec: rec['msg'].value) == expected_messages
def test_rabbitmq_producer_target(sdc_builder, sdc_executor, rabbitmq):
    """Test for RabbitMQ producer target stage.

    The producer pipeline publishes a fixed text message to a test queue. The same number of
    messages as the pipeline reports having output is then read back with a RabbitMQ client
    and compared against the raw input.

    The pipeline looks like:

    RabbitMQ Producer pipeline:
        dev_raw_data_source >> rabbitmq_producer
    """
    # build producer pipeline
    name = get_random_string(string.ascii_letters, 10)
    exchange_name = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source = dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_str)

    exchange_binding = dict(name=exchange_name, type='DIRECT', durable=False, autoDelete=True)
    rabbitmq_producer = builder.add_stage('RabbitMQ Producer')
    rabbitmq_producer.set_attributes(name=name,
                                     data_format='TEXT',
                                     durable=False,
                                     auto_delete=True,
                                     bindings=[exchange_binding])

    dev_raw_data_source >> rabbitmq_producer

    built_pipeline = builder.build(title='RabbitMQ Producer pipeline')
    producer_dest_pipeline = built_pipeline.configure_for_environment(rabbitmq)
    producer_dest_pipeline.rate_limit = 1

    # add pipeline and capture pipeline messages to assert
    sdc_executor.add_pipeline(producer_dest_pipeline)
    start_command = sdc_executor.start_pipeline(producer_dest_pipeline)
    start_command.wait_for_pipeline_batch_count(10)
    sdc_executor.stop_pipeline(producer_dest_pipeline)

    history = sdc_executor.get_pipeline_history(producer_dest_pipeline)
    output_counter = history.latest.metrics.counter('pipeline.batchOutputRecords.counter')
    msgs_sent_count = output_counter.count
    logger.debug('Number of messages ingested into the pipeline = %s', msgs_sent_count)

    # read data from RabbitMQ to assert it is what got ingested into the pipeline
    connection = rabbitmq.blocking_connection
    channel = connection.channel()
    try:
        # Get one message at a time from RabbitMQ.
        # Returns a sequence with the method frame, message properties, and body.
        msgs_received = []
        for _ in range(msgs_sent_count):
            body = channel.basic_get(name, False)[2]
            msgs_received.append(body.decode().replace('\n', ''))
    finally:
        channel.close()
        connection.close()

    logger.debug('Number of messages received from RabbitMQ = %d', len(msgs_received))
    assert msgs_received == [raw_str] * msgs_sent_count
def test_mysql_binary_log_json_column(sdc_builder, sdc_executor, database):
    """Check that the MySQL Binary Log origin reads a JSON column correctly.

    A row containing a JSON value is inserted into a fresh table and then read back through
    the MySQL Binary Log (AKA CDC) origin.

    Pipeline looks like:
        mysql_binary_log >> trash
    """
    table = None
    connection = None

    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against MySQL with CDC enabled.')

    try:
        # Create table.
        connection = database.engine.connect()
        table_name = get_random_string(string.ascii_lowercase, 20)
        id_column = sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True, autoincrement=False)
        name_column = sqlalchemy.Column('name', sqlalchemy.String(25))
        json_column = sqlalchemy.Column('json_column', sqlalchemy.JSON)
        table = sqlalchemy.Table(table_name, sqlalchemy.MetaData(), id_column, name_column, json_column)
        table.create(database.engine)

        # Insert data into table.
        row = {'id': 100, 'name': 'a', 'json_column': {'a': 123, 'b': 456}}
        connection.execute(table.insert(), row)

        # Create Pipeline.
        pipeline_builder = sdc_builder.get_pipeline_builder()
        mysql_binary_log = pipeline_builder.add_stage('MySQL Binary Log')
        # Restrict the origin to the table created above (database.table).
        qualified_table = '.'.join((database.database, table_name))
        mysql_binary_log.set_attributes(start_from_beginning=True,
                                        server_id='1',
                                        include_tables=qualified_table)
        trash = pipeline_builder.add_stage('Trash')

        mysql_binary_log >> trash
        pipeline = pipeline_builder.build().configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        # Run pipeline and verify output.
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, batches=1).snapshot
        sdc_executor.stop_pipeline(pipeline)

        first_batch = snapshot.snapshot_batches[0]
        for record in first_batch[mysql_binary_log.instance_name].output:
            assert record.field['Data']['id'] == 100
            assert record.field['Data']['name'] == 'a'
            # The JSON column is compared as its compact string form.
            assert record.field['Data']['json_column'].value == '{"a":123,"b":456}'
    finally:
        # Drop table and Connection.
        if table is not None:
            logger.info('Dropping table %s in %s database...', table, database.type)
            table.drop(database.engine)

        if connection is not None:
            connection.close()
def test_datalake_destination(sdc_builder, sdc_executor, azure, adls_version):
    """Check that the Data Lake Store destination writes the records it receives.

    Two JSON records are emitted by a Dev Raw Data Source, passed through a Record
    Deduplicator and written by the Data Lake Store destination selected through
    ``adls_version``. The file system client exposed by the ``azure`` fixture is then used
    to check that exactly one file was produced, that its name carries the configured
    prefix and suffix, and that its newline-delimited JSON content equals the input.

    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> azure_data_lake_store_destination
                               record_deduplicator >> to_error
    """
    directory_name = get_random_string(string.ascii_letters, 10)
    files_prefix = get_random_string(string.ascii_letters, 10)
    files_suffix = get_random_string(string.ascii_letters, 10)

    raw_list = [
        {'contact': {'name': 'Jane Smith', 'phone': 2124050000, 'zip_code': 27023}},
        {'contact': {'name': 'San', 'phone': 2120998998, 'zip_code': 14305}},
    ]
    raw_data = json.dumps(raw_list)

    # The legacy stage takes the bare directory name; the others take it with a leading slash.
    if adls_version == ADLS_LEGACY:
        directory_template = directory_name
    else:
        directory_template = f'/{directory_name}'

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       json_content='ARRAY_OBJECTS',
                                       raw_data=raw_data)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    target_stagelib = ADLS_GEN_STAGELIBS[adls_version].target_stagelib
    azure_data_lake_store_destination = builder.add_stage(name=target_stagelib)
    azure_data_lake_store_destination.set_attributes(data_format='JSON',
                                                     directory_template=directory_template,
                                                     files_prefix=files_prefix,
                                                     files_suffix=files_suffix)

    dev_raw_data_source >> record_deduplicator >> azure_data_lake_store_destination
    record_deduplicator >> to_error

    datalake_dest_pipeline = builder.build().configure_for_environment(azure)
    sdc_executor.add_pipeline(datalake_dest_pipeline)

    dl_fs = azure.datalake.file_system
    try:
        # Run the pipeline until both records have been output, then stop it.
        logger.info('Azure Data Lake directory %s will be created with files prefix %s',
                    directory_name, files_prefix)
        start_command = sdc_executor.start_pipeline(datalake_dest_pipeline)
        start_command.wait_for_pipeline_output_records_count(2)
        sdc_executor.stop_pipeline(datalake_dest_pipeline)

        # Exactly one file is expected in the target directory.
        written_files = dl_fs.ls(directory_name)
        assert len(written_files) == 1
        written_file = written_files[0]

        # The file name must carry the configured prefix and suffix.
        base_name = written_file.rsplit('/', 1)[-1]
        assert base_name.startswith(files_prefix)
        assert base_name.endswith(files_suffix)

        # The file holds len(raw_list) JSON records, delimited by newline (\n).
        text = dl_fs.cat(written_file).decode()
        result_list = [json.loads(line) for line in text.split('\n')]
        assert raw_list == result_list
    finally:
        leftovers = dl_fs.ls(directory_name)
        # Note: a non-empty directory cannot be removed, hence remove all files first.
        logger.info('Azure Data Lake directory %s and underlying files will be deleted.',
                    directory_name)
        for leftover in leftovers:
            dl_fs.rm(leftover)
        dl_fs.rmdir(directory_name)
def _test_emr_origin_to_s3(sdc_builder, sdc_executor, aws, pipeline_configs):
    """Run an EMR batch pipeline (Hadoop FS origin >> Amazon S3 destination) and verify its output.

    A small number of text objects is uploaded under a random input key, the pipeline is
    run in ``EMR_BATCH`` execution mode, and the objects found under the output key are
    compared against what was uploaded. Input and output objects are deleted afterwards.

    Args:
        sdc_builder: pipeline-builder fixture.
        sdc_executor: Data Collector executor fixture.
        aws: AWS environment fixture; supplies the S3 client, bucket names, region and credentials.
        pipeline_configs (dict): extra pipeline configuration entries, applied on top of the
            EMR defaults assembled here (so they win on key collisions).
    """
    s3_bucket = aws.emr_s3_bucket_name
    s3_input_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}/input'
    s3_output_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}/output'
    s3_staging_bucket = aws.emr_s3_staging_bucket_name
    s3_staging_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}/sdc_staging'
    s3_logging_key = f'{S3_SANDBOX_PREFIX}/{get_random_string(string.ascii_letters, 10)}/sdc_logging'
    raw_str = 'Hello World!'
    s3_obj_count = 2  # keep it low so that the number of MR jobs stays small and the test stays quick

    logger.info('%s S3 bucket used with input key: %s output key: %s and object count: %s',
                s3_bucket, s3_input_key, s3_output_key, s3_obj_count)
    logger.info('%s S3 staging bucket used with EMR staging key: %s and EMR logging key: %s',
                s3_staging_bucket, s3_staging_key, s3_logging_key)

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    emr_origin = builder.add_stage('Hadoop FS', type='origin')
    emr_origin.set_attributes(input_paths=[f's3a://{s3_bucket}/{s3_input_key}'],
                              data_format='TEXT')

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket,
                                  data_format='TEXT',
                                  partition_prefix=s3_output_key)

    emr_origin >> s3_destination

    pipeline = builder.build(title='Amazon EMR to S3 pipeline').configure_for_environment(aws)
    configs = {
        'executionMode': 'EMR_BATCH',
        'amazonEMRConfig.userRegion': aws.sdc_formatted_region,
        'amazonEMRConfig.accessKey': aws.aws_access_key_id,
        'amazonEMRConfig.secretKey': aws.aws_secret_access_key,
        'amazonEMRConfig.s3StagingUri': f's3://{s3_staging_bucket}/{s3_staging_key}',
        'amazonEMRConfig.s3LogUri': f's3://{s3_staging_bucket}/{s3_logging_key}',
        'amazonEMRConfig.enableEMRDebugging': False,
    }
    configs.update(pipeline_configs)
    pipeline.configuration.update(configs)
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        logger.info('Creating input S3 data ...')
        # The input must live in the bucket the origin reads from (and that cleanup purges),
        # i.e. s3_bucket, not the staging bucket.
        for i in range(s3_obj_count):
            client.put_object(Bucket=s3_bucket, Key=f'{s3_input_key}/{i}', Body=raw_str)

        # Do not wait for pipeline start: the transition from START to RUNNING takes a long time.
        sdc_executor.start_pipeline(pipeline, wait=False).wait_for_finished(timeout_sec=1800)

        # The number of output objects must equal the number of input objects.
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_output_key)
        assert len(list_s3_objs['Contents']) == s3_obj_count

        # Read the output back from S3 and compare it with what was ingested.
        s3_contents = [
            client.get_object(Bucket=s3_bucket, Key=s3_content['Key'])['Body'].read().decode().strip()
            for s3_content in list_s3_objs['Contents']
        ]
        assert s3_contents == [raw_str] * s3_obj_count
    finally:
        for label, prefix in (('input', s3_input_key), ('output', s3_output_key)):
            logger.info('Deleting %s S3 data ...', label)
            # 'Contents' is absent when nothing matches the prefix (e.g. the pipeline failed
            # before writing); tolerate that so cleanup never hides the original failure.
            listing = client.list_objects_v2(Bucket=s3_bucket, Prefix=prefix)
            keys = [{'Key': obj['Key']} for obj in listing.get('Contents', [])]
            if keys:
                client.delete_objects(Bucket=s3_bucket, Delete={'Objects': keys})
def test_mongodb_origin_simple_with_BSONBinary(sdc_builder, sdc_executor, mongodb):
    """Check that the MongoDB origin reads documents holding BSON Binary values.

    Three documents, each with a single ``data`` field of BSON Binary type, are inserted
    into a randomly named collection. A snapshot of the origin output is captured and the
    string form of each record's ``data`` is compared with the string form of the inserted
    values. The random database is dropped at the end.

    The pipeline looks like:
        mongodb_origin >> trash
    """
    payloads = (b'Binary Data Flute', b'Binary Data Oboe', b'Binary Data Violin')
    original_docs = [{'data': binary.Binary(payload)} for payload in payloads]

    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Discard')

    database_name = get_random_string(ascii_letters, 5)
    collection_name = get_random_string(ascii_letters, 10)
    mongodb_origin = pipeline_builder.add_stage('MongoDB', type='origin')
    mongodb_origin.set_attributes(capped_collection=False,
                                  database=database_name,
                                  collection=collection_name)

    trash = pipeline_builder.add_stage('Trash')
    mongodb_origin >> trash
    pipeline = pipeline_builder.build().configure_for_environment(mongodb)

    try:
        # MongoDB and PyMongo add '_id' to the dictionaries used for the insert,
        # so work on a deep copy and keep the originals untouched for the comparison.
        docs_in_database = copy.deepcopy(original_docs)

        # Create the documents with PyMongo: the database, then the collection inside it,
        # then the documents inside that collection.
        logger.info('Adding documents into %s collection using PyMongo...', mongodb_origin.collection)
        collection = mongodb.engine[mongodb_origin.database][mongodb_origin.collection]
        insert_results = []
        for doc in docs_in_database:
            insert_results.append(collection.insert_one(doc))
        assert len(insert_results) == len(docs_in_database)

        # Start the pipeline and verify the documents through a snapshot.
        sdc_executor.add_pipeline(pipeline)
        snapshot = sdc_executor.capture_snapshot(pipeline=pipeline, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)

        rows_from_snapshot = [{'data': str(record.value2['data'])}
                              for record in snapshot[mongodb_origin].output]
        expected_rows = [{'data': str(doc['data'])} for doc in original_docs]
        assert rows_from_snapshot == expected_rows
    finally:
        logger.info('Dropping %s database...', mongodb_origin.database)
        mongodb.engine.drop_database(mongodb_origin.database)
def test_kinesis_consumer_at_timestamp(sdc_builder, sdc_executor, aws):
    """Exercise the Kinesis Consumer origin with the AT_TIMESTAMP initial position.

    Steps:
        1. Publish a first set of messages to a freshly created stream.
        2. Sleep, then record the current time in milliseconds.
        3. Publish a second set of messages.
        4. Run a consumer pipeline configured to start at the recorded timestamp.
        5. Assert that every record in the captured batch belongs to the second set.

    The pipeline looks like:
        kinesis_consumer >> trash
    """
    application_name = get_random_string()
    stream_name = f'{aws.kinesis_stream_prefix}_{get_random_string()}'

    client = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')

        # 1. Publish the first set of messages.
        first_records = []
        for i in range(10):
            first_records.append({'Data': f'First Message {i}', 'PartitionKey': '111'})
        client.put_records(Records=first_records, StreamName=stream_name)

        # 2. Wait, then remember "now" in milliseconds.
        time.sleep(10)
        timestamp = int(time.time()) * 1000

        # 3. Publish the second set of messages.
        second_records = []
        for i in range(10):
            second_records.append({'Data': f'Second Message {i}', 'PartitionKey': '111'})
        client.put_records(Records=second_records, StreamName=stream_name)

        # 4. Build the consumer pipeline starting at the stored timestamp.
        builder = sdc_builder.get_pipeline_builder()
        builder.add_error_stage('Discard')

        kinesis_consumer = builder.add_stage('Kinesis Consumer')
        kinesis_consumer.set_attributes(application_name=application_name,
                                        data_format='TEXT',
                                        initial_position='AT_TIMESTAMP',
                                        initial_timestamp=timestamp,
                                        stream_name=stream_name)

        trash = builder.add_stage('Trash')
        kinesis_consumer >> trash

        consumer_origin_pipeline = builder.build(title='Kinesis Consumer pipeline')
        consumer_origin_pipeline = consumer_origin_pipeline.configure_for_environment(aws)
        sdc_executor.add_pipeline(consumer_origin_pipeline)

        # 5. Read one batch through the pipeline and check what was consumed.
        snapshot = sdc_executor.capture_snapshot(consumer_origin_pipeline,
                                                 start_pipeline=True,
                                                 batches=1).snapshot
        sdc_executor.stop_pipeline(consumer_origin_pipeline)

        stage_output = snapshot[kinesis_consumer.instance_name].output
        for record in stage_output:
            assert 'Second' in str(record.field)
    finally:
        # Stream operations are done: delete the stream.
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        client.delete_stream(StreamName=stream_name)
        logger.info('Deleting %s DynamoDB table on AWS ...', application_name)
        aws.dynamodb.delete_table(TableName=application_name)
def test_schema_generator_types(sdc_builder, input, converter_type, expected_value, sdc_executor):
    """Round-trip a value through the Schema Generator and an Avro file.

    The first pipeline emits ``{"value": input}``, converts ``/value`` to ``converter_type``
    (skipped for ``MAP`` and ``LIST``), generates a schema and writes the record as Avro to
    a temporary local directory using the schema from the record header. The second
    pipeline reads that directory back and finishes on the ``no-more-data`` event. Exactly
    one record is expected, whose ``value`` field equals ``expected_value``.
    """
    # Directory the first pipeline writes to and the second one reads from.
    tmp_directory = os.path.join(tempfile.gettempdir(), get_random_string(string.ascii_letters, 10))

    # First pipeline: generate the test record and its schema.
    builder = sdc_builder.get_pipeline_builder()

    origin = builder.add_stage('Dev Raw Data Source')
    origin.data_format = 'JSON'
    origin.raw_data = json.dumps({"value": input})
    origin.stop_after_first_batch = True

    if converter_type in ('MAP', 'LIST'):
        # No type conversion for these: the origin feeds the schema generator directly.
        upstream = origin
    else:
        converter = builder.add_stage('Field Type Converter')
        converter.conversion_method = 'BY_FIELD'
        converter.field_type_converter_configs = [{
            'fields': ['/value'],
            'targetType': converter_type,
            'dataLocale': 'en,US',
            'dateFormat': 'YYYY_MM_DD_HH_MM_SS',
            'zonedDateTimeFormat': 'ISO_OFFSET_DATE_TIME',
            'scale': 2
        }]
        origin >> converter
        upstream = converter

    # Generate the schema for the record.
    schema_generator = builder.add_stage('Schema Generator')
    schema_generator.expand_types = True
    schema_generator.schema_name = 'test_schema'

    # Store the record on the local file system as Avro, taking the schema from the header.
    local_fs = builder.add_stage('Local FS', type='destination')
    local_fs.directory_template = tmp_directory
    local_fs.data_format = 'AVRO'
    local_fs.configuration['configs.dataGeneratorFormatConfig.avroSchemaSource'] = 'HEADER'

    upstream >> schema_generator >> local_fs
    generator_pipeline = builder.build()

    # Second pipeline: read the generated Avro file back.
    builder = sdc_builder.get_pipeline_builder()

    directory = builder.add_stage('Directory', type='origin')
    directory.data_format = 'AVRO'
    directory.batch_wait_time_in_secs = 10
    directory.file_name_pattern = 'sdc*'
    directory.files_directory = tmp_directory

    finisher = builder.add_stage("Pipeline Finisher Executor")
    finisher.stage_record_preconditions = ["${record:eventType() == 'no-more-data'}"]
    directory >= finisher

    wiretap = builder.add_wiretap()
    directory >> wiretap.destination
    reader_pipeline = builder.build()

    sdc_executor.add_pipeline(generator_pipeline, reader_pipeline)

    # Run the pipelines one after the other: writer first, then reader.
    for pipeline in (generator_pipeline, reader_pipeline):
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

    records = wiretap.output_records
    assert len(records) == 1
    assert records[0].field['value'] == expected_value
def test_hbase_empty_key_expression(sdc_builder, sdc_executor, cluster):
    """Check that an empty row key expression in the HBase Lookup processor is rejected.

    The exported pipeline configuration must report zero issues, while starting the
    pipeline must raise an exception whose message contains
    ``HBASE_35 - Row key field has empty value``.

    The pipeline looks like:
        dev_raw_data_source >> hbase_lookup >> trash
    """
    # Some silly sample data.
    bike_races = [
        {'name': 'Tour de France', 'first_edition': '1903'},
        {'name': "Giro d'Italia", 'first_edition': '1909'},
        {'name': 'Vuelta a Espana', 'first_edition': '1935'},
    ]

    # One race name per line is what the Dev Raw Data Source will emit.
    race_names = [bike_race['name'] for bike_race in bike_races]
    raw_data = '\n'.join(race_names)

    # Lookup configuration with a deliberately empty row expression.
    lookup_parameters = [{
        'rowExpr': '',
        'columnExpr': 'info:first_edition',
        'outputFieldPath': '/founded',
        'timestampExpr': '',
    }]

    # Random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_data)

    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(lookup_parameters=lookup_parameters, table_name=table_name)

    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating HBase table %s ...', table_name)
        cluster.hbase.client.create_table(name=table_name, families={'info': {}})

        exported = sdc_executor.api_client.export_pipeline(pipeline.id)
        issues = exported['pipelineConfig']['issues']
        assert 0 == issues['issueCount']

        # Starting the pipeline is expected to fail.
        with pytest.raises(Exception) as exc_info:
            sdc_executor.start_pipeline(pipeline)
            sdc_executor.stop_pipeline(pipeline)

        assert 'HBASE_35' in exc_info.value.message
        assert 'HBASE_35 - Row key field has empty value' in exc_info.value.message
    finally:
        # Delete the HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        cluster.hbase.client.delete_table(name=table_name, disable=True)
def topic():
    """Return a random 10-letter topic name for a single test, logging it at debug level."""
    topic_name = get_random_string(string.ascii_letters, 10)
    logger.debug('Using Topic: %s', topic_name)
    return topic_name
def test_hbase_lookup_processor_invalid_column_family(sdc_builder, sdc_executor, cluster):
    """Check that the HBase Lookup processor fails on an invalid column family.

    The table is created with the single column family ``info`` while one of the lookup
    parameters refers to ``invalid:column``. Starting the pipeline must raise an exception
    whose message contains ``HBASE_36``.

    The pipeline looks like:
        dev_raw_data_source >> hbase_lookup >> trash
    """
    # NOTE(review): the previous docstring named HBase_37 as the expected error, whereas
    # the assertion below checks for HBASE_36 - confirm which code is the intended one.

    # Some silly sample data.
    bike_races = [
        {'name': 'Tour de France', 'first_edition': '1903'},
        {'name': "Giro d'Italia", 'first_edition': '1909'},
        {'name': 'Vuelta a Espana', 'first_edition': '1935'},
    ]

    # One race name per line is what the Dev Raw Data Source will emit.
    race_names = [bike_race['name'] for bike_race in bike_races]
    raw_data = '\n'.join(race_names)

    # Two lookups on the same row key: a valid column first, then one in a missing family.
    lookup_parameters = [
        {
            'rowExpr': "${record:value('/text')}",
            'columnExpr': column_expr,
            'outputFieldPath': '/founded',
            'timestampExpr': '',
        }
        for column_expr in ('info:first_edition', 'invalid:column')
    ]

    # Random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_data)

    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(on_record_error='TO_ERROR',
                                lookup_parameters=lookup_parameters,
                                table_name=table_name)

    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating HBase table %s ...', table_name)
        cluster.hbase.client.create_table(name=table_name, families={'info': {}})

        # Starting the pipeline is expected to fail.
        with pytest.raises(Exception) as exc_info:
            sdc_executor.start_pipeline(pipeline)
            sdc_executor.stop_pipeline(pipeline)

        assert 'HBASE_36' in exc_info.value.message
    finally:
        # Delete the HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        cluster.hbase.client.delete_table(name=table_name, disable=True)
def test_elasticsearch_credentials_format(sdc_builder, sdc_executor, elasticsearch, join_credentials):
    """Check both credential formats accepted by the Elasticsearch destination.

    When ``join_credentials`` is true the user name field carries ``username:password``
    and the password field is left empty; otherwise user name and password go into their
    own fields. In both cases one document is indexed and then read back through the
    Elasticsearch client for verification.

    The pipeline looks like:
        dev_raw_data_source >> es_target
    """
    # Static test data. Elasticsearch index names must be lower case.
    es_index = get_random_string(string.ascii_letters, 10).lower()
    es_mapping = get_random_string(string.ascii_letters, 10)
    es_doc_id = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    if not join_credentials:
        username = elasticsearch.username
        password = elasticsearch.password
    else:
        username = ':'.join((elasticsearch.username, elasticsearch.password))
        password = ''

    # Build the pipeline.
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(
        data_format='TEXT',
        stop_after_first_batch=True,
        raw_data=raw_str)

    es_target = builder.add_stage('Elasticsearch', type='destination')
    es_target.set_attributes(default_operation='INDEX',
                             document_id=es_doc_id,
                             index=es_index,
                             mapping=es_mapping,
                             use_security=True,
                             user_name=username,
                             password=password)

    dev_raw_data_source >> es_target
    es_target_pipeline = builder.build().configure_for_environment(elasticsearch)
    es_target_pipeline.configuration["shouldRetry"] = False

    sdc_executor.add_pipeline(es_target_pipeline)

    try:
        elasticsearch.client.create_index(es_index)

        # Run the pipeline, then read the document back from Elasticsearch.
        sdc_executor.start_pipeline(es_target_pipeline).wait_for_finished()

        # Same index, mapping and document id throughout, so only one document (index 0) is expected.
        response = elasticsearch.client.search(es_index)
        assert len(response) == 1

        hit = response[0]
        assert hit['_index'] == es_index
        assert hit['_id'] == es_doc_id
        assert hit['_type'] == es_mapping
        assert hit['_source'] == {'text': raw_str}
    finally:
        # Clean up the test data in Elasticsearch.
        elasticsearch.client.delete_index(es_index)
def test_postgres_cdc_client_basic(sdc_builder, sdc_executor, database):
    """Basic PostgreSQL CDC Client test: inserts, updates and deletes are read in order.

    The snapshot capture is started first (without waiting), then a table is created and
    insert, update and delete operations are performed on it. Every snapshot record that
    carries a ``/change`` field is compared, element by element, with the expected
    operations returned by the ``_insert``/``_update``/``_delete`` helpers.

    The origin's `Initial Change` setting is not configured here; per the original test
    notes its default is `From the latest change`, i.e. only changes made after the
    pipeline start are processed.

    The pipeline looks like:
        postgres_cdc_client >> trash
    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=False,
                                       replication_slot=replication_slot_name)
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    # Bound before the try block so the cleanup never hits an unbound name (which would
    # replace the real failure with an UnboundLocalError) when setup fails early.
    table = None
    connection = None
    try:
        # Database operations done after pipeline start will be captured by CDC.
        # Hence start the pipeline but do not wait for the capture to be finished.
        snapshot_command = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, wait=False)

        # Create the table and then perform insert, update and delete operations.
        table = _create_table_in_database(table_name, database)
        connection = database.engine.connect()
        expected_operations_data = _insert(connection=connection, table=table)
        expected_operations_data += _update(connection=connection, table=table)
        expected_operations_data += _delete(connection=connection, table=table)

        snapshot = snapshot_command.wait_for_finished().snapshot

        # Verify snapshot data is received in exactly the expected order.
        operation_index = 0
        for record in snapshot[postgres_cdc_client.instance_name].output:
            # No need to worry about DDL related CDC records, e.g. table creation.
            if not record.get_field_data('/change'):
                continue

            # Each operation (insert, update and delete) was performed on 3 rows,
            # so each CDC record change contains a list of 3 elements.
            for i in range(3):
                expected = expected_operations_data[operation_index]
                change_path = f'/change[{i}]'
                assert expected.kind == record.get_field_data(f'{change_path}/kind')
                assert expected.table == record.get_field_data(f'{change_path}/table')

                # For delete operations there are no columnnames and columnvalues fields.
                if expected.kind != KIND_FOR_DELETE:
                    assert expected.columnnames == record.get_field_data(f'{change_path}/columnnames')
                    assert expected.columnvalues == record.get_field_data(f'{change_path}/columnvalues')

                # For update and delete operations verify the extra information about old keys.
                if expected.kind != KIND_FOR_INSERT:
                    assert expected.oldkeys.keynames == record.get_field_data(
                        f'{change_path}/oldkeys/keynames')
                    assert expected.oldkeys.keyvalues == record.get_field_data(
                        f'{change_path}/oldkeys/keyvalues')

                operation_index += 1
    finally:
        if pipeline:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        database.deactivate_and_drop_replication_slot(replication_slot_name)
        # Release the test's own connection before dropping the table it was used on.
        if connection is not None:
            connection.close()
        if table is not None:
            table.drop(database.engine)
            logger.info('Table: %s dropped.', table_name)
def _run_test_s3_error_destination(sdc_builder, sdc_executor, aws, anonymous):
    """Write an error record to S3 through the error stage and read it back.

    The first pipeline sends one JSON record to `To Error`, with `Write to Amazon S3`
    configured as the pipeline error stage. The test then checks that exactly one object
    exists under the chosen prefix and runs a second pipeline with an Amazon S3 origin
    that reads it back as SDC_JSON; the single record read must contain the original text.

    Args:
        sdc_builder: pipeline-builder fixture.
        sdc_executor: Data Collector executor fixture.
        aws: AWS environment fixture; supplies the S3 client and the default bucket name.
        anonymous (bool): when true, a dedicated bucket is created via ``create_bucket`` (and
            deleted afterwards) and both S3 stages go through ``configure_stage_for_anonymous``.
    """
    # Bucket and key are resolved before the try block so that the cleanup below can
    # always refer to them; if bucket creation itself fails there is nothing to clean up.
    if anonymous:
        s3_bucket = create_bucket(aws)
        logger.info(f'Bucket {s3_bucket} created')
    else:
        s3_bucket = aws.s3_bucket_name
    s3_key = f'{S3_SANDBOX_PREFIX}/errDest-{get_random_string()}/'

    try:
        random_string = get_random_string(string.ascii_letters, 10)
        random_raw_json_str = f'{{"text":"{random_string}"}}'

        # Build the pipeline that produces the error record.
        builder = sdc_builder.get_pipeline_builder()

        s3_err = builder.add_error_stage('Write to Amazon S3')
        s3_err.set_attributes(bucket=s3_bucket, common_prefix=s3_key)
        if anonymous:
            configure_stage_for_anonymous(s3_err)

        origin = builder.add_stage('Dev Raw Data Source', type='origin')
        origin.set_attributes(data_format='JSON',
                              raw_data=random_raw_json_str,
                              stop_after_first_batch=True)

        target = builder.add_stage('To Error', type='destination')

        origin >> target

        pipeline = builder.build().configure_for_environment(aws)
        pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(pipeline)

        # Build another pipeline with an S3 origin to read the data back.
        builder = sdc_builder.get_pipeline_builder()

        s3_origin = builder.add_stage('Amazon S3', type='origin')
        s3_origin.set_attributes(bucket=s3_bucket,
                                 data_format='SDC_JSON',
                                 prefix_pattern=f'{s3_key}*',
                                 max_batch_size_in_records=100)
        if anonymous:
            configure_stage_for_anonymous(s3_origin)

        wiretap = builder.add_wiretap()

        finisher = builder.add_stage('Pipeline Finisher Executor')
        finisher.set_attributes(stage_record_preconditions=["${record:eventType() == 'no-more-data'}"])

        s3_origin >> wiretap.destination
        s3_origin >= finisher

        read_pipeline = builder.build().configure_for_environment(aws)
        read_pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(read_pipeline)

        client = aws.s3

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # We should have exactly one object under the prefix.
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert 'Contents' in list_s3_objs  # If no object was found, there is no 'Contents' key
        assert len(list_s3_objs['Contents']) == 1

        sdc_executor.start_pipeline(read_pipeline).wait_for_finished()

        assert len(wiretap.output_records) == 1
        assert wiretap.output_records[0].field['text'] == random_string
    finally:
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        finally:
            # The dedicated bucket is removed even if deleting its data failed.
            if anonymous:
                logger.info(f'Deleting bucket {s3_bucket}')
                aws.s3.delete_bucket(Bucket=s3_bucket)
def test_postgres_cdc_client_remove_replication_slot(sdc_builder, sdc_executor, database):
    """Test the 'Remove replication slot on close' functionality.

    Steps:
        1. Initialize and start the pipeline with a specified replication slot.
        2. Pass some data (insert, update and delete operations on a new table).
        3. Stop the pipeline.
        4. Query the database for replication slots and check the slot is gone.

    The test is skipped for PostgreSQL versions older than
    ``databases.EARLIEST_POSTGRESQL_VERSION_WITH_ACTIVE_PID`` and when CDC is not enabled.
    """
    if database.database_server_version < databases.EARLIEST_POSTGRESQL_VERSION_WITH_ACTIVE_PID:
        # Test only runs against PostgreSQL versions with the active_pid column in pg_replication_slots.
        pytest.skip(
            'Test only runs against PostgreSQL version >= '
            f"{'.'.join(str(item) for item in databases.EARLIEST_POSTGRESQL_VERSION_WITH_ACTIVE_PID)}"
        )

    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)
    replication_slot = get_random_string(string.ascii_lowercase, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=True,
                                       replication_slot=replication_slot)
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    # Bound before the try block so the cleanup never hits an unbound name (which would
    # replace the real failure with an UnboundLocalError) when setup fails early.
    table = None
    connection = None
    try:
        # Database operations done after pipeline start will be captured by CDC.
        # Hence start the pipeline but do not wait for the capture to be finished.
        snapshot_command = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, wait=False)

        # Create the table and perform some operations to simulate activity; the
        # operations' return values are not needed by this test.
        table = _create_table_in_database(table_name, database)
        connection = database.engine.connect()
        _insert(connection=connection, table=table)
        _update(connection=connection, table=table)
        _delete(connection=connection, table=table)

        snapshot_command.wait_for_finished()

        # Timeout is set as without SDC-11252, pipeline will get stuck in 'STOPPING' state forever.
        sdc_executor.stop_pipeline(pipeline=pipeline).wait_for_stopped(timeout_sec=60)

        # After pipeline stoppage, check on the replication slots remaining.
        listed_slots = connection.execute(CHECK_REP_SLOT_QUERY).fetchall()

        # Check that replication_slot is not in listed_slots.
        logger.info('Replication slot: %s', replication_slot)
        logger.info('List of current slots: %s', listed_slots)
        assert (replication_slot,) not in listed_slots
    finally:
        # Release the test's own connection before dropping the table it was used on.
        if connection is not None:
            connection.close()
        if table is not None:
            table.drop(database.engine)
            logger.info('Table: %s dropped.', table_name)
def test_kinesis_consumer(sdc_builder, sdc_executor, aws):
    """Check that the Kinesis Consumer origin reads what was published to a stream.

    Ten messages are published to a freshly created stream with the Kinesis client. A
    pipeline reading from TRIM_HORIZON is started with snapshot capture, and the set of
    ``text`` values in the snapshot must equal the set of published messages. The stream
    and the DynamoDB table named after the application are deleted afterwards.

    The pipeline looks like:
        kinesis_consumer >> trash
    """
    # Build the consumer pipeline.
    application_name = get_random_string(string.ascii_letters, 10)
    random_suffix = get_random_string(string.ascii_letters, 10)
    stream_name = '{}_{}'.format(aws.kinesis_stream_prefix, random_suffix)

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    kinesis_consumer = builder.add_stage('Kinesis Consumer')
    kinesis_consumer.set_attributes(application_name=application_name,
                                    data_format='TEXT',
                                    initial_position='TRIM_HORIZON',
                                    stream_name=stream_name)

    trash = builder.add_stage('Trash')
    kinesis_consumer >> trash

    consumer_origin_pipeline = builder.build(title='Kinesis Consumer pipeline')
    consumer_origin_pipeline = consumer_origin_pipeline.configure_for_environment(aws)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    # Run the pipeline and capture a snapshot.
    client = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')

        expected_messages = {'Message {0}'.format(i) for i in range(10)}
        # PartitionKey logic is not exercised here, hence a fixed placeholder key.
        put_records = []
        for message in expected_messages:
            put_records.append({'Data': message, 'PartitionKey': '111'})
        client.put_records(Records=put_records, StreamName=stream_name)

        # Messages are published: read them through the pipeline and compare.
        snapshot = sdc_executor.capture_snapshot(consumer_origin_pipeline, start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(consumer_origin_pipeline)

        stage_output = snapshot[kinesis_consumer.instance_name].output
        output_records = [record.field['text'].value for record in stage_output]

        assert set(output_records) == expected_messages
    finally:
        # Stream operations are done: delete the stream.
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        client.delete_stream(StreamName=stream_name)
        logger.info('Deleting %s DynamoDB table on AWS ...', application_name)
        aws.dynamodb.delete_table(TableName=application_name)
def test_ftp_destination(sdc_builder, sdc_executor, ftp):
    """Smoke test for the SFTP/FTP Client destination.

    A first pipeline writes a local file through the Local FS destination. A second pipeline picks
    that file up with the Directory origin (whole-file format) and hands it to the SFTP/FTP Client
    destination. The uploaded file is then read back from the server and compared with the source data.

    The pipelines look like:

        dev_raw_data_source >> local_fs
        directory >> sftp_ftp_client
    """
    # Name under which the file is expected to appear on the FTP server.
    upload_name = get_random_string(string.ascii_letters, 10)
    # Randomly named directory under the system temp dir; the first pipeline writes the source file
    # here and the second pipeline reads it from here.
    staging_dir = os.path.join(tempfile.gettempdir(), get_random_string(string.ascii_letters, 10))

    def assert_one_record_in_and_out(pipeline):
        # The latest run of ``pipeline`` must have read and written exactly one record.
        metrics = sdc_executor.get_pipeline_history(pipeline).latest.metrics
        assert metrics.counter('pipeline.batchInputRecords.counter').count == 1
        assert metrics.counter('pipeline.batchOutputRecords.counter').count == 1

    # Pipeline 1: create the source file locally.
    source_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = source_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       raw_data='Hello World!',
                                       stop_after_first_batch=True)

    local_fs = source_builder.add_stage('Local FS', type='destination')
    local_fs.set_attributes(directory_template=staging_dir, data_format='TEXT')

    dev_raw_data_source >> local_fs
    local_fs_pipeline = source_builder.build('Local FS Pipeline')

    # Pipeline 2: upload that file to the FTP server.
    upload_builder = sdc_builder.get_pipeline_builder()

    directory = upload_builder.add_stage('Directory', type='origin')
    directory.set_attributes(data_format='WHOLE_FILE',
                             file_name_pattern='sdc*',
                             files_directory=staging_dir)

    sftp_ftp_client = upload_builder.add_stage('SFTP/FTP Client', type='destination')
    sftp_ftp_client.set_attributes(file_name_expression=upload_name)

    directory >> sftp_ftp_client
    sftp_ftp_client_pipeline = upload_builder.build('FTP Destination Pipeline').configure_for_environment(ftp)

    sdc_executor.add_pipeline(local_fs_pipeline, sftp_ftp_client_pipeline)

    # Run the file-creation pipeline to completion and check its record counters.
    sdc_executor.start_pipeline(local_fs_pipeline).wait_for_finished()
    assert_one_record_in_and_out(local_fs_pipeline)

    # Run the upload pipeline until it has emitted one record, stop it, and check its counters.
    sdc_executor.start_pipeline(sftp_ftp_client_pipeline).wait_for_pipeline_output_records_count(1)
    sdc_executor.stop_pipeline(sftp_ftp_client_pipeline)
    assert_one_record_in_and_out(sftp_ftp_client_pipeline)

    # What landed on the FTP server must match what the raw data source produced.
    assert ftp.get_string(upload_name) == dev_raw_data_source.raw_data

    # Remove the uploaded file; always close the FTP session, even if the delete fails.
    ftp_connection = ftp.client
    try:
        ftp_connection.delete(upload_name)
    finally:
        ftp_connection.quit()
def test_firehose_destination_to_s3(sdc_builder, sdc_executor, aws):
    """Test the Kinesis Firehose destination stage, assuming the delivery stream targets an S3 bucket.

    A Dev Raw Data Source feeds a random JSON record to a Firehose destination whose delivery stream
    is pre-configured to write to S3. After waiting for Firehose's buffering interval, the bucket is
    listed with the S3 client and the objects whose content equals the generated record are counted.

    The pipeline looks like:

    Firehose Destination pipeline:
        dev_raw_data_source >> record_deduplicator >> firehose_destination
                               record_deduplicator >> to_error
    """
    s3_client = aws.s3
    firehose_client = aws.firehose

    # setup test static
    s3_bucket = aws.s3_bucket_name
    stream_name = aws.firehose_stream_name
    # json formatted string
    random_raw_str = '{{"text":"{0}"}}'.format(get_random_string(string.ascii_letters, 10))
    record_count = 1  # random_raw_str record size
    s3_put_keys = []  # keys of the S3 objects created by this test; filled in below, deleted in finally

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  raw_data=random_raw_str)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    firehose_destination = builder.add_stage('Kinesis Firehose')
    firehose_destination.set_attributes(stream_name=stream_name, data_format='JSON')

    # NOTE(review): presumably the second `>>` wires the deduplicator's duplicate-records lane to
    # To Error so that only one copy of the record reaches Firehose - confirm.
    dev_raw_data_source >> record_deduplicator >> firehose_destination
    record_deduplicator >> to_error

    firehose_dest_pipeline = builder.build(
        title='Amazon Firehose destination pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(firehose_dest_pipeline)

    try:
        # start pipeline and assert
        sdc_executor.start_pipeline(firehose_dest_pipeline).wait_for_pipeline_output_records_count(record_count)
        sdc_executor.stop_pipeline(firehose_dest_pipeline)

        # wait till data is available in S3. We do so by querying for buffer wait time and sleep till then
        resp = firehose_client.describe_delivery_stream(DeliveryStreamName=stream_name)
        dests = resp['DeliveryStreamDescription']['Destinations'][0]
        wait_secs = dests['ExtendedS3DestinationDescription']['BufferingHints']['IntervalInSeconds']
        time.sleep(wait_secs + 15)  # few seconds more to wait to make sure S3 gets the data

        # Firehose S3 object naming http://docs.aws.amazon.com/firehose/latest/dev/basic-deliver.html#s3-object-name
        # read data to assert
        list_s3_objs = s3_client.list_objects_v2(Bucket=s3_bucket, Prefix=datetime.utcnow().strftime("%Y/%m/%d"))
        # list_objects_v2 omits the 'Contents' key when nothing matches the prefix; treat that as an
        # empty listing so the failure surfaces in the assertion below instead of as a KeyError.
        for s3_content in list_s3_objs.get('Contents', []):
            akey = s3_content['Key']
            aobj = s3_client.get_object(Bucket=s3_bucket, Key=akey)
            if aobj['Body'].read().decode().strip() == random_raw_str:
                s3_put_keys.append(akey)

        assert len(s3_put_keys) == record_count
    finally:
        # delete S3 objects related to this test
        if s3_put_keys:
            delete_keys = {'Objects': [{'Key': k} for k in s3_put_keys]}
            s3_client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
def test_hadoop_fs_origin_simple(sdc_builder, sdc_executor, cluster):
    """Check that the Hadoop FS origin reads a simple text file from a randomly named HDFS folder.

    Cluster mode pipelines don't support snapshots, so verification goes through a second,
    standalone pipeline: the Hadoop FS pipeline forwards its records over SDC RPC and the
    standalone pipeline's SDC RPC origin is snapshotted instead.

    Specifically, this would look like:

    Hadoop FS pipeline:
        hadoop_fs_origin >> sdc_rpc_destination
    Snapshot pipeline:
        sdc_rpc_origin >> trash
    """
    hadoop_fs_folder = f'/tmp/out/{get_random_string(string.ascii_letters, 10)}'

    # --- Hadoop FS pipeline (runs in cluster batch mode) ---
    cluster_builder = sdc_builder.get_pipeline_builder()
    cluster_builder.add_error_stage('Discard')

    hadoop_fs = cluster_builder.add_stage('Hadoop FS', type='origin')
    hadoop_fs.data_format = 'TEXT'
    hadoop_fs.input_paths.append(hadoop_fs_folder)

    # Both RPC ends share this randomly generated id.
    rpc_id = get_random_string(string.ascii_letters, 10)
    sdc_rpc_destination = cluster_builder.add_stage('SDC RPC', type='destination')
    sdc_rpc_destination.sdc_rpc_connection.append(f'{sdc_executor.server_host}:{SDC_RPC_LISTENING_PORT}')
    sdc_rpc_destination.sdc_rpc_id = rpc_id

    hadoop_fs >> sdc_rpc_destination
    hadoop_fs_pipeline = cluster_builder.build(title='Hadoop FS pipeline').configure_for_environment(cluster)
    hadoop_fs_pipeline.configuration['executionMode'] = 'CLUSTER_BATCH'

    # --- Snapshot pipeline (standalone) ---
    standalone_builder = sdc_builder.get_pipeline_builder()
    standalone_builder.add_error_stage('Discard')

    sdc_rpc_origin = standalone_builder.add_stage('SDC RPC', type='origin')
    sdc_rpc_origin.sdc_rpc_listening_port = SDC_RPC_LISTENING_PORT
    sdc_rpc_origin.sdc_rpc_id = rpc_id
    # YARN jobs are slow to start; a 5 minute batch wait time keeps the RPC origin from handing
    # the snapshot an empty batch in the meantime.
    sdc_rpc_origin.batch_wait_time_in_secs = 300

    trash = standalone_builder.add_stage('Trash')
    sdc_rpc_origin >> trash
    snapshot_pipeline = standalone_builder.build(title='Snapshot pipeline')

    # Register both pipelines with SDC, then write the input file with the HDFS client.
    sdc_executor.add_pipeline(hadoop_fs_pipeline, snapshot_pipeline)

    try:
        lines_in_file = ['hello', 'hi', 'how are you?']
        file_body = '\n'.join(lines_in_file)

        logger.debug('Writing file %s/file.txt to Hadoop FS ...', hadoop_fs_folder)
        hdfs_client = cluster.hdfs.client
        hdfs_client.makedirs(hadoop_fs_folder)
        hdfs_client.write(os.path.join(hadoop_fs_folder, 'file.txt'), data=file_body)

        # The snapshot capture is kicked off without blocking (wait=False) so that the Hadoop FS
        # pipeline can be started while the snapshot pipeline is already listening. Only then do
        # we block on the snapshot command and inspect what it captured.
        logger.debug('Starting snapshot pipeline and capturing snapshot ...')
        snapshot_pipeline_command = sdc_executor.capture_snapshot(snapshot_pipeline,
                                                                  start_pipeline=True,
                                                                  wait=False)

        logger.debug('Starting Hadoop FS pipeline and waiting for it to finish ...')
        sdc_executor.start_pipeline(hadoop_fs_pipeline)

        snapshot = snapshot_pipeline_command.wait_for_finished(timeout_sec=120).snapshot
        sdc_executor.stop_pipeline(snapshot_pipeline, force=True)

        # Output of the first stage of the snapshot pipeline.
        origin_output = snapshot[snapshot_pipeline[0].instance_name].output
        lines_from_snapshot = [record.field['text'].value for record in origin_output]

        assert lines_from_snapshot == lines_in_file
    finally:
        cluster.hdfs.client.delete(hadoop_fs_folder, recursive=True)
def _file_tail_build_generator_pipeline(sdc_builder, directory, raw_data, title):
    """Build a `dev_raw_data_source >> local_fs` pipeline writing ``raw_data`` as text into ``directory``."""
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       raw_data=raw_data,
                                       stop_after_first_batch=True)

    local_fs = pipeline_builder.add_stage('Local FS', type='destination')
    local_fs.set_attributes(data_format='TEXT',
                            directory_template=directory,
                            files_prefix='sdc-',
                            max_records_in_file=100)

    dev_raw_data_source >> local_fs
    return pipeline_builder.build(title)


def _file_tail_assert_text_records(wiretaps, expected_text, expected_count):
    """Assert every text record captured by ``wiretaps`` equals ``expected_text`` and count them.

    Only records that carry a 'text' field are checked and counted; anything else captured by the
    wiretaps is skipped. The total across all wiretaps must equal ``expected_count``.
    """
    size_output = 0
    for wiretap in wiretaps:
        for record in wiretap.output_records:
            if 'text' in record.field:
                assert expected_text == record.field['text'].value
                size_output += 1
    assert size_output == expected_count


def test_file_tale_origin_stop_continue(sdc_builder, sdc_executor):
    """Test that the File Tail origin reads new data after being stopped and started again.

    Files are pre-created with a Local FS destination pipeline and then read by the File Tail
    origin. The File Tail pipeline is stopped, a second file is generated, and the same File Tail
    pipeline is started again; the second run is expected to deliver only the newly written lines.

    The pipelines look like:

        dev_raw_data_source >> local_fs
        file_tail >> trash
    """
    tmp_directory = os.path.join(tempfile.gettempdir(), get_random_string(string.ascii_letters, 10))

    # 1st pipeline which generates the required files for the File Tail origin
    files_pipeline = _file_tail_build_generator_pipeline(sdc_builder, tmp_directory, 'Hello!\n' * 10,
                                                         'Generate file for start-stop')
    sdc_executor.add_pipeline(files_pipeline)
    # Wait for the generator to finish so the file really is in place before File Tail starts
    # (the second generator run below does the same).
    sdc_executor.start_pipeline(files_pipeline).wait_for_finished()

    # 2nd pipeline which reads the files using File Tail stage
    pipeline_builder = sdc_builder.get_pipeline_builder()
    file_tail = pipeline_builder.add_stage('File Tail', type='origin')
    file_tail.set_attributes(data_format='TEXT',
                             file_to_tail=[{
                                 'fileRollMode': 'ALPHABETICAL',
                                 'fileFullPath': f'{tmp_directory}/*'
                             }])
    wiretap_1 = pipeline_builder.add_wiretap()
    wiretap_2 = pipeline_builder.add_wiretap()

    file_tail >> wiretap_1.destination
    file_tail >> wiretap_2.destination

    file_tail_pipeline = pipeline_builder.build('File Tail Origin pipeline')
    sdc_executor.add_pipeline(file_tail_pipeline)

    sdc_executor.start_pipeline(file_tail_pipeline)
    sdc_executor.wait_for_pipeline_metric(file_tail_pipeline, 'data_batch_count', 1)
    sdc_executor.stop_pipeline(file_tail_pipeline)

    # The wiretaps may hold both data records and other records; only those with a 'text' field
    # are compared against the generated data and counted.
    _file_tail_assert_text_records((wiretap_1, wiretap_2), 'Hello!', 10)

    # Second run of the generator: writes the data the restarted File Tail pipeline should pick up
    files_pipeline_2 = _file_tail_build_generator_pipeline(sdc_builder, tmp_directory, 'Bye!\n' * 10,
                                                           'Generate file for start-stop 2')
    sdc_executor.add_pipeline(files_pipeline_2)
    sdc_executor.start_pipeline(files_pipeline_2).wait_for_finished()

    # Drop what the wiretaps captured during the first run before restarting File Tail.
    wiretap_1.reset()
    wiretap_2.reset()

    sdc_executor.start_pipeline(file_tail_pipeline)
    sdc_executor.wait_for_pipeline_metric(file_tail_pipeline, 'data_batch_count', 1)
    sdc_executor.stop_pipeline(file_tail_pipeline)

    _file_tail_assert_text_records((wiretap_1, wiretap_2), 'Bye!', 10)