Example #1
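These snippets are excerpted from StreamSets Test Framework (STF) test suites, and their shared imports are not shown. A minimal sketch of what they presumably rely on, inferred from usage (exact module paths may differ between framework versions):

import copy
import json
import logging
import os
import string
import tempfile
import time
from operator import itemgetter
from string import ascii_letters
from time import sleep

import pika
import pytest
import sqlalchemy
from bson import binary
from sqlalchemy import Column, Integer, String

# Assumed helper import; in STF this utility usually lives here.
from streamsets.testframework.utils import get_random_string

logger = logging.getLogger(__name__)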
def test_datalake_destination_max_records(sdc_builder, sdc_executor, azure,
                                          adls_version):
    """Test for Data Lake Store target stage setting max number of records per file as 1.
       The pipeline looks like:

        Data Lake Store Destination pipeline:
            dev_data_generator >> azure_data_lake_store_destination
    """
    directory_name = get_random_string(string.ascii_letters, 10)
    files_prefix = get_random_string(string.ascii_letters, 10)
    files_suffix = 'json'

    raw_data = [
        dict(id=1, name='abc'),
        dict(id=2, name='def'),
        dict(id=3, name='ghi'),
        dict(id=4, name='jkl'),
        dict(id=5, name='mno'),
        dict(id=6, name='pqr'),
        dict(id=7, name='stu'),
        dict(id=8, name='vwx'),
        dict(id=9, name='y'),
        dict(id=10, name='z')
    ]

    # Records are serialized as concatenated JSON objects, one per record.
    dev_raw_data_source_data = ''.join(json.dumps(d) for d in raw_data)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       raw_data=dev_raw_data_source_data,
                                       stop_after_first_batch=True)

    azure_data_lake_store = pipeline_builder.add_stage(
        name=ADLS_GEN_STAGELIBS[adls_version].target_stagelib)
    azure_data_lake_store.set_attributes(
        data_format='JSON',
        directory_template=directory_name
        if adls_version == ADLS_LEGACY else f'/{directory_name}',
        files_prefix=files_prefix,
        files_suffix=files_suffix,
        max_records_in_file=1)
    dev_raw_data_source >> azure_data_lake_store

    pipeline = pipeline_builder.build().configure_for_environment(azure)
    sdc_executor.add_pipeline(pipeline)
    dl_fs = azure.datalake.file_system

    try:
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        dl_files = dl_fs.ls(directory_name)
        assert len(dl_files) == len(raw_data)
        for dl_file in dl_files:
            dl_file_name = dl_file.split('/')[-1]
            assert dl_file_name.startswith(
                files_prefix) and dl_file_name.endswith(files_suffix)
        dl_file_contents = [
            json.loads(dl_fs.cat(dl_file).decode()) for dl_file in dl_files
        ]

        assert sorted(dl_file_contents,
                      key=itemgetter('id')) == sorted(raw_data,
                                                      key=itemgetter('id'))
    finally:
        dl_files = dl_fs.ls(directory_name)
        logger.info(
            'Azure Data Lake directory %s and underlying files will be deleted.',
            directory_name)
        # Note: Non-empty directory is not allowed to be removed, hence remove all files first.
        for dl_file in dl_files:
            dl_fs.rm(dl_file)
        dl_fs.rmdir(directory_name)
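
# The generator below is presumably used as a pytest fixture that supplies
# Elasticsearch-style test records (index/mapping/doc_id/shard); its decorator
# is not part of this excerpt and is assumed here.
@pytest.fixture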
def test_data():
    yield [{
        "text": "Record1",
        "index": get_random_string(string.ascii_letters, 10).lower(),
        "mapping": get_random_string(string.ascii_letters, 10).lower(),
        "doc_id": get_random_string(string.ascii_letters, 10).lower(),
        "shard": get_random_string(string.ascii_letters, 10).lower(),
    }, {
        "text": "Record2",
        "index": get_random_string(string.ascii_letters, 10).lower(),
        "mapping": get_random_string(string.ascii_letters, 10).lower(),
        "doc_id": get_random_string(string.ascii_letters, 10).lower(),
        "shard": get_random_string(string.ascii_letters, 10).lower(),
    }, {
        "text": "Record3",
        "index": get_random_string(string.ascii_letters, 10).lower(),
        "mapping": get_random_string(string.ascii_letters, 10).lower(),
        "doc_id": get_random_string(string.ascii_letters, 10).lower(),
        "shard": get_random_string(string.ascii_letters, 10).lower(),
    }, {
        "text": "Record4",
        "index": get_random_string(string.ascii_letters, 10).lower(),
        "mapping": get_random_string(string.ascii_letters, 10).lower(),
        "doc_id": get_random_string(string.ascii_letters, 10).lower(),
        "shard": None
    }]
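For reference, a more compact construction that yields records of the same shape (a sketch only; the helper names below are hypothetical and not part of the original suite):

def _random_id():
    return get_random_string(string.ascii_letters, 10).lower()


def compact_test_data():
    # Same shape as the fixture above: Record1..Record4, with Record4 having shard=None.
    yield [{
        'text': f'Record{i}',
        'index': _random_id(),
        'mapping': _random_id(),
        'doc_id': _random_id(),
        'shard': _random_id() if i < 4 else None,
    } for i in range(1, 5)]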
Example #3
def test_sql_server_cdc_with_cdc_schema_name(sdc_builder, sdc_executor,
                                             database, no_of_threads):
    """Test for SQL Server CDC origin stage when schema change is enabled.
    We do so by capturing Insert Operation on CDC enabled table(s)
    using SQL Server CDC Origin and having a pipeline which reads that data using SQL Server CDC origin stage.
    The records in the pipeline will be stored in SQL Server table using JDBC Producer.
    While the pipeline is running the source table schema is changed by dropping or adding the columns,
    the dest table will be dropping or adding the columns respectively.
    Data is then asserted for what is captured at SQL Server Job and what we read in the pipeline.
    The pipeline looks like:
        sql_server_cdc_origin >> jdbc_producer
    """
    schema_name = DEFAULT_SCHEMA_NAME

    pipeline_builder = sdc_builder.get_pipeline_builder()
    sql_server_cdc = pipeline_builder.add_stage('SQL Server CDC Client')
    sql_server_cdc.set_attributes(
        allow_late_tables=True,
        enable_schema_changes_event=True,
        # when allow_late_tables = true, the pipeline runs one background thread
        # to spool the list of cdc tables
        maximum_pool_size=no_of_threads + 1,
        minimum_idle_connections=no_of_threads + 1,
        new_table_discovery_interval='${1 * SECONDS}',
        number_of_threads=no_of_threads)

    dest_table_name = get_random_string(string.ascii_uppercase, 9)

    dest_table = create_table(database, DEFAULT_SCHEMA_NAME, dest_table_name)
    jdbc_producer = pipeline_builder.add_stage('JDBC Producer')

    jdbc_producer.set_attributes(default_operation='INSERT',
                                 field_to_column_mapping=[],
                                 schema_name=DEFAULT_SCHEMA_NAME,
                                 table_name=dest_table_name)

    sql_server_cdc >> jdbc_producer

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        tables = []
        no_of_records = 5
        rows_in_database = setup_sample_data(no_of_threads * no_of_records)

        for index in range(0, no_of_threads):
            table_name = get_random_string(string.ascii_lowercase, 20)
            # split the rows_in_database into no_of_records for each table,
            # e.g. for no_of_records=5, the first table inserts rows_in_database[0:5]
            # and the second table inserts rows_in_database[5:10]
            table = setup_table(
                database, DEFAULT_SCHEMA_NAME, table_name,
                rows_in_database[(index * no_of_records):((index + 1) *
                                                          no_of_records)])
            tables.append(table)

        # wait for data captured by cdc jobs in sql server before starting the pipeline
        ct_table_name = f'{DEFAULT_SCHEMA_NAME}_{table_name}_CT'
        wait_for_data_in_table(ct_table_name, no_of_records, 'cdc', database)

        sdc_executor.start_pipeline(pipeline)

        wait_for_data_in_table(dest_table_name, no_of_records * no_of_threads,
                               DEFAULT_SCHEMA_NAME, database)

        assert_table_replicated(database, rows_in_database,
                                DEFAULT_SCHEMA_NAME, dest_table_name)

        # add the new column to the last input table
        connection = database.engine.connect()
        logger.info('Adding the column new_column varchar(10) on %s.%s...',
                    schema_name, table_name)
        connection.execute(
            f'ALTER TABLE {table_name} ADD new_column VARCHAR(10)')
        logger.info('Adding the column new_column varchar(10) on %s.%s...',
                    schema_name, dest_table_name)
        connection.execute(
            f'ALTER TABLE {dest_table_name} ADD new_column VARCHAR(10)')

        logger.info('Restarting CDC on table %s', table_name)
        connection.execute(f'EXEC sys.sp_cdc_enable_table '
                           f'@source_schema=N\'{DEFAULT_SCHEMA_NAME}\', '
                           f'@source_name=N\'{table_name}\','
                           f'@role_name = NULL, '
                           f'@capture_instance={schema_name}_{table_name}_2')
        sleep(1)
        logger.info("Enabled _2 CT")
        connection.execute(f'EXEC sys.sp_cdc_disable_table '
                           f'@source_schema=N\'{DEFAULT_SCHEMA_NAME}\', '
                           f'@source_name=N\'{table_name}\','
                           f'@capture_instance={schema_name}_{table_name}')

        sleep(1)
        logger.info("Disabled CT")

        table = sqlalchemy.Table(table_name,
                                 sqlalchemy.MetaData(),
                                 sqlalchemy.Column('id',
                                                   sqlalchemy.Integer,
                                                   primary_key=True,
                                                   autoincrement=False),
                                 sqlalchemy.Column('name',
                                                   sqlalchemy.String(25)),
                                 sqlalchemy.Column('dt',
                                                   sqlalchemy.String(25)),
                                 sqlalchemy.Column('new_column',
                                                   sqlalchemy.String(10)),
                                 schema=schema_name)

        new_sample_data = [
            {
                'id': counter,
                'name': get_random_string(string.ascii_lowercase, 20),
                'dt': '2017-05-05',
                'new_column': get_random_string(string.ascii_lowercase, 10)
            }
            # ids continue from the last counter of rows_in_database for another no_of_records records
            for counter in range(no_of_threads *
                                 no_of_records, (no_of_threads + 1) *
                                 no_of_records)
        ]
        logger.info('Adding %s rows into %s.%s...', len(new_sample_data),
                    schema_name, table_name)
        connection.execute(table.insert(), new_sample_data)

        # adjust the existing sample data by adding new_column=None and append the new sample data to the list
        rows_in_database.extend(new_sample_data)
        # WARNING! the schema change is not captured by JDBC Producer
        for data in rows_in_database:
            data.update(new_column=None)

        ct2_table_name = f'{DEFAULT_SCHEMA_NAME}_{table_name}_2_CT'

        # wait for the completion of the next batch
        wait_for_data_in_table(ct2_table_name, no_of_records, 'cdc', database)

        sdc_executor.stop_pipeline(pipeline)

        assert_table_replicated(database, rows_in_database,
                                DEFAULT_SCHEMA_NAME, dest_table_name)
    finally:
        tables.append(dest_table)
        for table in tables:
            logger.info('Dropping table %s in %s database...', table,
                        database.type)
            table.drop(database.engine)
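The test above relies on helpers such as setup_sample_data, setup_table, assert_table_replicated and wait_for_data_in_table, which are not part of this excerpt. A minimal sketch of wait_for_data_in_table, assuming it simply polls the row count until the expected number of records arrives:

def wait_for_data_in_table(table_name, expected_no_of_records, schema_name,
                           database, timeout_sec=300):
    # Poll schema_name.table_name until it holds at least the expected row count.
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        with database.engine.connect() as connection:
            count = connection.execute(
                f'SELECT COUNT(*) FROM {schema_name}.{table_name}').scalar()
        if count >= expected_no_of_records:
            return
        sleep(5)
    raise TimeoutError(f'Timed out waiting for {expected_no_of_records} rows '
                       f'in {schema_name}.{table_name}')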
def test_hbase_lookup_processor_get_row(sdc_builder, sdc_executor, cluster):
    """HBase Lookup processor test.
    pipeline will be poroperly configured, will get the expected rows
    dev_raw_data_source >> hbase_lookup >> trash
    """
    # Generate some silly data.
    bike_races = [
        dict(name='Tour de France', first_edition='1903'),
        dict(name='Giro d Italia', first_edition='1909'),
        dict(name='Vuelta a Espana', first_edition='1935')
    ]

    expected = [(b'Giro d Italia', {
        b'info:first_edition': b'1909'
    }), (b'Tour de France', {
        b'info:first_edition': b'1903'
    }), (b'Vuelta a Espana', {
        b'info:first_edition': b'1935'
    })]

    # Convert to raw data for the Dev Raw Data Source.
    raw_data = '\n'.join(bike_race['name'] for bike_race in bike_races)

    # Generate HBase Lookup's attributes.
    lookup_parameters = [
        dict(rowExpr="${record:value('/text')}",
             columnExpr='info:first_edition',
             outputFieldPath='/founded',
             timestampExpr='')
    ]

    # Get random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Source stage.
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_data)

    # Create HBase Lookup processor.
    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(lookup_parameters=lookup_parameters,
                                table_name=table_name)

    # Create trash destination.
    trash = pipeline_builder.add_stage('Trash')

    # Build pipeline.
    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating HBase table %s ...', table_name)
        cluster.hbase.client.create_table(name=table_name,
                                          families={'info': {}})

        # Use HappyBase's `Batch` instance to avoid unnecessary calls to HBase.
        batch = cluster.hbase.client.table(table_name).batch()
        for bike_race in bike_races:
            # Use of str.encode() below is because HBase (and HappyBase) speaks in byte arrays.
            batch.put(
                bike_race['name'].encode(),
                {b'info:first_edition': bike_race['first_edition'].encode()})
        batch.send()

        # Take a pipeline snapshot.
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)

        # Validate output.
        assert [
            dict(name=record.value2['text'],
                 first_edition=record.value2['founded'])
            for record in snapshot[hbase_lookup.instance_name].output
        ] == bike_races

        # Validate output.
        result_list = list(cluster.hbase.client.table(table_name).scan())
        assert result_list == expected

    finally:
        # Delete HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        cluster.hbase.client.delete_table(name=table_name, disable=True)
def test_postgres_cdc_client_filtering_table(sdc_builder, sdc_executor,
                                             database):
    """
        Test filtering for inserts/updates/deletes to a Postgres table

        1. Random table names for "table_allow", "table_deny"
        2. Filter OUT anything for "table_deny"
        3. Insert/update/delete for both tables
        4. Should see updates for "table_allow" only

        The pipeline looks like:
        postgres_cdc_client >> trash
    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name_allow = get_random_string(string.ascii_lowercase, 20)
    table_name_deny = get_random_string(string.ascii_lowercase, 20)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)

    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=False,
                                       replication_slot=replication_slot_name,
                                       schema_table_configs=[{
                                           'schema': 'public'
                                       }, {
                                           'exclude_pattern':
                                           table_name_deny
                                       }, {
                                           'table':
                                           table_name_allow
                                       }])
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Database operations done after pipeline start will be captured by CDC.
        # Hence start the pipeline but do not wait for the capture to be finished.
        snapshot_command = sdc_executor.capture_snapshot(pipeline,
                                                         start_pipeline=True,
                                                         wait=False)

        # Create table and then perform insert, update and delete operations.
        table_allow = _create_table_in_database(table_name_allow, database)
        table_deny = _create_table_in_database(table_name_deny, database)
        connection = database.engine.connect()

        expected_operations_data = _insert(connection=connection,
                                           table=table_allow)
        expected_operations_data += _update(connection=connection,
                                            table=table_allow)
        expected_operations_data += _delete(connection=connection,
                                            table=table_allow)

        actual_operations_data = expected_operations_data.copy()

        # Operations on the denied table are performed too, but the table filter
        # should keep them out of the captured CDC output.
        actual_operations_data += _insert(connection=connection,
                                          table=table_deny)
        actual_operations_data += _update(connection=connection,
                                          table=table_deny)
        actual_operations_data += _delete(connection=connection,
                                          table=table_deny)

        snapshot = snapshot_command.wait_for_finished().snapshot

        # Verify snapshot data is received in exact order as expected.
        operation_index = 0

        for record in snapshot[postgres_cdc_client.instance_name].output:
            # No need to worry about DDL-related CDC records (e.g. table creation).
            if record.get_field_data('/change'):
                # Since we performed each operation (insert, update and delete) on 3 rows,
                # each CDC record change contains a list of 3 elements.
                for i in range(3):
                    if operation_index >= len(expected_operations_data):
                        break
                    expected = expected_operations_data[operation_index]
                    assert expected.kind == record.get_field_data(
                        f'/change[{i}]/kind')
                    assert expected.table == record.get_field_data(
                        f'/change[{i}]/table')
                    # For delete operation there are no columnnames and columnvalues fields.
                    if expected.kind != KIND_FOR_DELETE:
                        assert expected.columnnames == record.get_field_data(
                            f'/change[{i}]/columnnames')
                        assert expected.columnvalues == record.get_field_data(
                            f'/change[{i}]/columnvalues')
                    if expected.kind != KIND_FOR_INSERT:
                        # For update and delete operations verify extra information about old keys.
                        assert expected.oldkeys.keynames == record.get_field_data(
                            f'/change[{i}]/oldkeys/keynames')
                        assert expected.oldkeys.keyvalues == record.get_field_data(
                            f'/change[{i}]/oldkeys/keyvalues')
                    operation_index += 1

    finally:
        if pipeline:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        database.deactivate_and_drop_replication_slot(replication_slot_name)
        if table_allow is not None:
            table_allow.drop(database.engine)
            logger.info('Table: %s dropped.', table_name_allow)
        if table_deny is not None:
            table_deny.drop(database.engine)
            logger.info('Table: %s dropped.', table_name_deny)
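The _create_table_in_database, _insert, _update and _delete helpers used above are not shown in this excerpt. A rough sketch of _create_table_in_database, assuming the simple id/name layout used elsewhere in these tests:

def _create_table_in_database(table_name, database):
    table = sqlalchemy.Table(
        table_name, sqlalchemy.MetaData(),
        sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column('name', sqlalchemy.String(32)))
    logger.info('Creating table %s in %s database ...', table_name,
                database.type)
    table.create(database.engine)
    return table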
def test_hbase_lookup_processor_invalid_url(sdc_builder, sdc_executor,
                                            cluster):
    """HBase Lookup processor test.
    pipeline will have an invalid url, not errors would be shown
    dev_raw_data_source >> hbase_lookup >> trash
    """
    # Generate some silly data.
    bike_races = [
        dict(name='Tour de France', first_edition='1903'),
        dict(name="Giro d'Italia", first_edition='1909'),
        dict(name='Vuelta a Espana', first_edition='1935')
    ]

    # Convert to raw data for the Dev Raw Data Source.
    raw_data = '\n'.join(bike_race['name'] for bike_race in bike_races)

    # Generate HBase Lookup's attributes.
    lookup_parameters = [
        dict(rowExpr="${record:value('/text')}",
             columnExpr='info:empty',
             outputFieldPath='/founded',
             timestampExpr='')
    ]

    # Get random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Source stage.
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_data)

    # Create HBase Lookup processor.
    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(lookup_parameters=lookup_parameters,
                                table_name=table_name)
    hbase_lookup.zookeeper_quorum = None

    # Create trash destination.
    trash = pipeline_builder.add_stage('Trash')

    # Build pipeline.
    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating HBase table %s ...', table_name)
        cluster.hbase.client.create_table(name=table_name,
                                          families={'info': {}})

        # Use HappyBase's `Batch` instance to avoid unnecessary calls to HBase.
        batch = cluster.hbase.client.table(table_name).batch()
        for bike_race in bike_races:
            # Use of str.encode() below is because HBase (and HappyBase) speaks in byte arrays.
            batch.put(
                bike_race['name'].encode(),
                {b'info:first_edition': bike_race['first_edition'].encode()})
        batch.send()

        # Run preview.
        preview = sdc_executor.run_pipeline_preview(pipeline).preview
        assert preview is not None

        assert preview.issues.issues_count == 0

    finally:
        # Delete HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        cluster.hbase.client.delete_table(name=table_name, disable=True)
def test_hbase_get_empty_key_to_error(sdc_builder, sdc_executor, cluster):
    """Check record is sent to error when there is no key in the record and ignore row missing field is set to false
    dev_raw_data_source >> hbase_lookup >> trash
    """

    data = {'columnField': 'cf1:column'}
    json_data = json.dumps(data)

    # Generate HBase Lookup's attributes.
    lookup_parameters = [
        dict(rowExpr="${record:value('/row_key')}",
             columnExpr="${record:value('/columnField')}",
             outputFieldPath='/output',
             timestampExpr='')
    ]

    # Get random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Source stage.
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.data_format = 'JSON'
    dev_raw_data_source.raw_data = json_data

    # Create HBase Lookup processor.
    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(lookup_parameters=lookup_parameters,
                                table_name=table_name,
                                on_record_error='TO_ERROR',
                                ignore_row_missing_field=False)

    # Create trash destination.
    trash = pipeline_builder.add_stage('Trash')

    # Build pipeline.
    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating HBase table %s ...', table_name)
        cluster.hbase.client.create_table(name=table_name,
                                          families={'cf1': {}})

        # Take a pipeline snapshot.
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)

        scan = cluster.hbase.client.table(table_name).scan()

        assert 0 == len(list(scan))

        stage = snapshot[hbase_lookup.instance_name]
        logger.info('Error records %s ...', stage.error_records)

        assert len(stage.error_records) == 1

    finally:
        # Delete HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        cluster.hbase.client.delete_table(name=table_name, disable=True)
Example #8
def test_rabbitmq_rabbitmq_consumer(sdc_builder, sdc_executor, rabbitmq):
    """Test for RabbitMQ consumer origin stage. We do so by publishing data to a test queue using RabbitMQ client and
    having a pipeline which reads that data using RabbitMQ consumer origin stage. Data is then asserted for what is
    published at RabbitMQ client and what we read in the pipeline snapshot. The pipeline looks like:

    RabbitMQ Consumer pipeline:
        rabbitmq_consumer >> trash
    """
    # Build consumer pipeline.
    name = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    # We use the default exchange, hence the exchange does not need to be pre-created or given.
    rabbitmq_consumer = builder.add_stage('RabbitMQ Consumer').set_attributes(
        name=name,
        data_format='TEXT',
        durable=True,
        auto_delete=False,
        bindings=[])
    trash = builder.add_stage('Trash')

    rabbitmq_consumer >> trash

    consumer_origin_pipeline = builder.build(
        title='RabbitMQ Consumer pipeline').configure_for_environment(rabbitmq)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    # Run pipeline and capture snapshot.
    expected_messages = set()
    connection = rabbitmq.blocking_connection
    channel = connection.channel()

    # About default exchange routing: https://www.rabbitmq.com/tutorials/amqp-concepts.html
    channel.queue_declare(queue=name,
                          durable=True,
                          exclusive=False,
                          auto_delete=False)
    channel.confirm_delivery()
    for i in range(10):
        expected_message = 'Message {0}'.format(i)
        expected_messages.add(expected_message)
        try:
            channel.basic_publish(
                exchange="",
                routing_key=name,  # Routing key has to be same as queue name.
                body=expected_message,
                properties=pika.BasicProperties(content_type='text/plain',
                                                delivery_mode=1),
                mandatory=True)
        except Exception:
            logger.warning('Message %s could not be sent.', expected_message)

    channel.close()
    connection.close()

    # Messages are published, read through the pipeline and assert.
    snapshot = sdc_executor.capture_snapshot(consumer_origin_pipeline,
                                             start_pipeline=True).snapshot
    sdc_executor.stop_pipeline(consumer_origin_pipeline)
    output_records = [
        record.field['text'].value
        for record in snapshot[rabbitmq_consumer.instance_name].output
    ]

    assert set(output_records) == expected_messages
Example #9
def test_jdbc_multitable_consumer_origin_configuration_additional_jdbc_configuration_properties(
        sdc_builder, sdc_executor, database, postgres_target_server):
    """Here we are testing disableColumnSanitiser and targetServerType parameter. Setting it to false should convert
    columns in result set to lower case. targetServerType - Should connect successfully if server is of
    targetServerType is master. The master/slave distinction is currently done by observing if the server allows
    writes. If targetServerTypeis slave it should raise error as server we are connecting to allows writes
    i.e. its of type master.
    """
    src_table_prefix = get_random_string(string.ascii_lowercase, 6)
    table_name = '{}_{}'.format(src_table_prefix,
                                get_random_string(string.ascii_lowercase, 20))

    try:
        columns = [
            Column('id', Integer, primary_key=True),
            Column('NAME', String(32))
        ]
        properties = [{
            'key': 'disableColumnSanitiser',
            'value': 'false'
        }, {
            'key': 'targetServerType',
            'value': postgres_target_server
        }]
        rows_in_database = [{
            'id': row['id'],
            'NAME': row['name']
        } for row in ROWS_IN_DATABASE]
        table = create_table(database, columns, table_name)
        insert_data_in_table(database, table, rows_in_database)

        # Build the pipeline
        attributes = {
            'table_configs': [{
                "tablePattern": f'%{src_table_prefix}%'
            }],
            'additional_jdbc_configuration_properties': properties
        }
        jdbc_multitable_consumer, pipeline = get_jdbc_multitable_consumer_to_trash_pipeline(
            sdc_builder, database, attributes)

        # Execute pipeline and check result.
        sdc_executor.add_pipeline(pipeline)
        if postgres_target_server == 'slave':
            with pytest.raises(Exception):
                sdc_executor.start_pipeline(pipeline).wait_for_status('FINISHED')
        else:
            snapshot = sdc_executor.capture_snapshot(
                pipeline=pipeline, start_pipeline=True).snapshot

            # Column names are converted to lower case since database table columns are in upper case.
            tuples_to_lower_name = lambda tup: (tup[0].lower(), tup[1])
            rows_from_snapshot = [
                tuples_to_lower_name(list(record.field.items())[1])
                for record in snapshot[pipeline[0].instance_name].output
            ]
            assert rows_from_snapshot == [('name', row['NAME'])
                                          for row in rows_in_database]
    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        delete_table([table], database)
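The helper get_jdbc_multitable_consumer_to_trash_pipeline used above is not shown in this excerpt. Judging by its call site, it presumably builds a jdbc_multitable_consumer >> trash pipeline and returns both the origin stage and the pipeline, roughly as in this sketch:

def get_jdbc_multitable_consumer_to_trash_pipeline(sdc_builder, database,
                                                   attributes):
    pipeline_builder = sdc_builder.get_pipeline_builder()
    jdbc_multitable_consumer = pipeline_builder.add_stage(
        'JDBC Multitable Consumer')
    jdbc_multitable_consumer.set_attributes(**attributes)
    trash = pipeline_builder.add_stage('Trash')
    jdbc_multitable_consumer >> trash
    pipeline = pipeline_builder.build().configure_for_environment(database)
    return jdbc_multitable_consumer, pipeline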
Example #10
def test_rabbitmq_producer_msg_expiration(sdc_builder, sdc_executor, rabbitmq,
                                          set_expiration):
    """Test expiration time in the messages sent by RabbitMQ Producer.

    In SDC 3.10.0 the "Set Expiration" option is introduced, which allows SDC users to enable/disable the
    Expiration Time in the AMQP Message Properties. Prior to that version, users were forced to set an Expiration
    Time when AMQP Message Properties were actived. This test checks that messages will be expired within the
    configured milliseconds only when the "Set Expiration" is enabled.

    Pipeline:
        dev_raw_data_source >> rabbitmq_producer

    """
    queue_name = get_random_string(string.ascii_letters, 10)
    exchange_name = get_random_string(string.ascii_letters, 10)
    input_str = 'Hello World!'
    expiration_ms = 2000
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=input_str)

    rabbitmq_producer = builder.add_stage('RabbitMQ Producer')
    rabbitmq_producer.set_attributes(name=queue_name,
                                     data_format='TEXT',
                                     set_amqp_message_properties=True,
                                     set_expiration=set_expiration,
                                     expiration=expiration_ms,
                                     bindings=[
                                         dict(name=exchange_name,
                                              type='DIRECT',
                                              durable=False,
                                              autoDelete=True)
                                     ])

    dev_raw_data_source >> rabbitmq_producer
    pipeline = builder.build().configure_for_environment(rabbitmq)
    sdc_executor.add_pipeline(pipeline)

    # Set up RabbitMQ client to consume messages sent by SDC
    connection = rabbitmq.blocking_connection
    channel = connection.channel()

    try:
        # Send a message and consume it within `expiration_ms` milliseconds.
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)
        sdc_executor.stop_pipeline(pipeline)

        msg_read = channel.basic_get(queue_name,
                                     False)[2].decode().replace('\n', '')
        assert msg_read == input_str

        # Send a message, wait `expiration_ms` milliseconds, and consume RabbitMQ queue. If the "Set Expiration"
        # option is enabled, the queue will be empty and no message will be consumed.
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)
        sdc_executor.stop_pipeline(pipeline)

        time.sleep(expiration_ms * 0.001)
        msg_read = channel.basic_get(queue_name, False)[2]
        if set_expiration:
            assert msg_read is None
        else:
            assert msg_read.decode().replace('\n', '') == input_str

    finally:
        channel.queue_delete(queue_name)
        channel.close()
        connection.close()
Example #11
def test_rabbitmq_rabbitmq_consumer_wrong_format(sdc_builder, sdc_executor,
                                                 rabbitmq):
    """Test for RabbitMQ consumer origin stage. We do so by publishing data to a test queue using RabbitMQ client and
    having a pipeline which reads that data using RabbitMQ consumer origin stage. Data is then asserted for what is
    published at RabbitMQ client and what we read in the pipeline snapshot.
    Ten records are processed. The second one has a wrong format and should be sent to error; the rest should be read.
    The batch size is set to 1, which made the connector fail (SDC-14644).
    The pipeline looks like:

    RabbitMQ Consumer pipeline:
        rabbitmq_consumer >> trash
    """
    # Build consumer pipeline.
    name = get_random_string(string.ascii_letters, 10)

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    # We set to use default exchange and hence exchange does not need to be pre-created or given.
    rabbitmq_consumer = builder.add_stage('RabbitMQ Consumer')
    rabbitmq_consumer.set_attributes(name=name,
                                     data_format='JSON',
                                     durable=True,
                                     auto_delete=False,
                                     on_record_error='TO_ERROR',
                                     max_batch_size_in_records=1,
                                     bindings=[
                                         dict(name=name,
                                              type='DIRECT',
                                              durable=True,
                                              autoDelete=False)
                                     ])
    trash = builder.add_stage('Trash')

    rabbitmq_consumer >> trash

    consumer_origin_pipeline = builder.build(
        title='RabbitMQ Consumer pipeline').configure_for_environment(rabbitmq)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    # Create the input messages and the expected messages.
    expected_messages = [{'msg': f'Message {i}'} for i in range(10) if i != 2]
    input_messages = [json.dumps(msg) for msg in expected_messages]
    input_messages.insert(
        1, '{"msg":')  # Badly formatted JSON: no closing brace, no value.

    connection = rabbitmq.blocking_connection
    channel = connection.channel()

    # About default exchange routing: https://www.rabbitmq.com/tutorials/amqp-concepts.html
    channel.queue_declare(queue=name,
                          durable=True,
                          exclusive=False,
                          auto_delete=False)
    channel.confirm_delivery()
    for msg in input_messages:
        try:
            channel.basic_publish(
                exchange="",
                routing_key=name,  # Routing key has to be same as queue name.
                body=msg,
                properties=pika.BasicProperties(content_type='text/plain',
                                                delivery_mode=1),
                mandatory=True)
        except Exception:
            logger.warning('Message %s could not be sent.', msg)

    channel.close()
    connection.close()

    # Messages are published, read through the pipeline and assert.
    snapshot = sdc_executor.capture_snapshot(
        consumer_origin_pipeline,
        start_pipeline=True,
        batches=10,
        batch_size=1).wait_for_finished().snapshot

    # The badly formatted second message produced a stage error; check its error code.
    error_msg = sdc_executor.get_stage_errors(consumer_origin_pipeline,
                                              rabbitmq_consumer)[0].error_code
    assert error_msg == 'RABBITMQ_04'

    sdc_executor.stop_pipeline(consumer_origin_pipeline)
    output_records = [
        record.field for batch in snapshot.snapshot_batches for record in
        batch.stage_outputs[rabbitmq_consumer.instance_name].output
    ]

    # Data Collector does not guarantee the order of the messages, so we sort them.
    assert sorted(output_records,
                  key=lambda rec: rec['msg'].value) == expected_messages
Example #12
def test_rabbitmq_producer_target(sdc_builder, sdc_executor, rabbitmq):
    """Test for RabbitMQ producer target stage. We do so by publishing data to a test queue using RabbitMQ producer
    stage and then read the data from that queue using RabbitMQ client. We assert the data from the client to what has
    been injected by the producer pipeline. The pipeline looks like:

    RabbitMQ Producer pipeline:
        dev_raw_data_source >> rabbitmq_producer
    """
    # build producer pipeline
    name = get_random_string(string.ascii_letters, 10)
    exchange_name = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='TEXT',
                                              raw_data=raw_str)

    rabbitmq_producer = builder.add_stage('RabbitMQ Producer')
    rabbitmq_producer.set_attributes(name=name,
                                     data_format='TEXT',
                                     durable=False,
                                     auto_delete=True,
                                     bindings=[
                                         dict(name=exchange_name,
                                              type='DIRECT',
                                              durable=False,
                                              autoDelete=True)
                                     ])

    dev_raw_data_source >> rabbitmq_producer
    producer_dest_pipeline = builder.build(
        title='RabbitMQ Producer pipeline').configure_for_environment(rabbitmq)
    producer_dest_pipeline.rate_limit = 1

    # add pipeline and capture pipeline messages to assert
    sdc_executor.add_pipeline(producer_dest_pipeline)
    sdc_executor.start_pipeline(
        producer_dest_pipeline).wait_for_pipeline_batch_count(10)
    sdc_executor.stop_pipeline(producer_dest_pipeline)

    history = sdc_executor.get_pipeline_history(producer_dest_pipeline)
    msgs_sent_count = history.latest.metrics.counter(
        'pipeline.batchOutputRecords.counter').count
    logger.debug('Number of messages ingested into the pipeline = %s',
                 msgs_sent_count)

    # read data from RabbitMQ to assert it is what got ingested into the pipeline
    connection = rabbitmq.blocking_connection
    channel = connection.channel()
    try:
        # Get one message at a time from RabbitMQ.
        # Returns a sequence with the method frame, message properties, and body.
        msgs_received = [
            channel.basic_get(name, False)[2].decode().replace('\n', '')
            for _ in range(msgs_sent_count)
        ]
    finally:
        channel.close()
        connection.close()

    logger.debug('Number of messages received from RabbitMQ = %d',
                 (len(msgs_received)))

    assert msgs_received == [raw_str] * msgs_sent_count
def test_mysql_binary_log_json_column(sdc_builder, sdc_executor, database):
    """Test that MySQL Binary Log Origin is able to correctly read a json column in a row coming from MySQL Binary Log
    (AKA CDC).

    Pipeline looks like:

        mysql_binary_log >> trash
    """
    table = None
    connection = None

    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against MySQL with CDC enabled.')

    try:
        # Create table.
        connection = database.engine.connect()
        table_name = get_random_string(string.ascii_lowercase, 20)
        table = sqlalchemy.Table(
            table_name, sqlalchemy.MetaData(),
            sqlalchemy.Column('id',
                              sqlalchemy.Integer,
                              primary_key=True,
                              autoincrement=False),
            sqlalchemy.Column('name', sqlalchemy.String(25)),
            sqlalchemy.Column('json_column', sqlalchemy.JSON))
        table.create(database.engine)

        # Insert data into table.
        connection.execute(table.insert(), {
            'id': 100,
            'name': 'a',
            'json_column': {
                'a': 123,
                'b': 456
            }
        })

        # Create Pipeline.
        pipeline_builder = sdc_builder.get_pipeline_builder()
        mysql_binary_log = pipeline_builder.add_stage('MySQL Binary Log')
        mysql_binary_log.set_attributes(start_from_beginning=True,
                                        server_id='1',
                                        include_tables=database.database +
                                        '.' + table_name)
        trash = pipeline_builder.add_stage('Trash')

        mysql_binary_log >> trash

        pipeline = pipeline_builder.build().configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        # Run pipeline and verify output.
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True,
                                                 batches=1).snapshot
        sdc_executor.stop_pipeline(pipeline)

        for record in snapshot.snapshot_batches[0][
                mysql_binary_log.instance_name].output:
            assert record.field['Data']['id'] == 100
            assert record.field['Data']['name'] == 'a'
            assert record.field['Data'][
                'json_column'].value == '{"a":123,"b":456}'

    finally:
        # Drop table and Connection.
        if table is not None:
            logger.info('Dropping table %s in %s database...', table,
                        database.type)
            table.drop(database.engine)

        if connection is not None:
            connection.close()
Example #14
def test_datalake_destination(sdc_builder, sdc_executor, azure, adls_version):
    """Test for Data Lake Store target stage. We do so by running a dev raw data source generator to Data Lake Store
    destination with its provided account FQDN and then reading Data Lake Store using STF client to assert data
    between the client to what has been ingested by the pipeline. We use a record deduplicator processor in
    between dev raw data source origin and Data Lake Store destination in order to determine exactly what has
    been ingested. The pipeline looks like:

    Data Lake Store Destination pipeline:
        dev_raw_data_source >> record_deduplicator >> azure_data_lake_store_destination
                                                   >> to_error
    """
    directory_name = get_random_string(string.ascii_letters, 10)
    files_prefix = get_random_string(string.ascii_letters, 10)
    files_suffix = get_random_string(string.ascii_letters, 10)
    raw_list = [
        dict(
            contact=dict(name='Jane Smith', phone=2124050000, zip_code=27023)),
        dict(contact=dict(name='San', phone=2120998998, zip_code=14305))
    ]
    raw_data = json.dumps(raw_list)

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       json_content='ARRAY_OBJECTS',
                                       raw_data=raw_data)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    azure_data_lake_store_destination = builder.add_stage(
        name=ADLS_GEN_STAGELIBS[adls_version].target_stagelib)
    azure_data_lake_store_destination.set_attributes(
        data_format='JSON',
        directory_template=(directory_name if adls_version == ADLS_LEGACY else
                            f'/{directory_name}'),
        files_prefix=files_prefix,
        files_suffix=files_suffix)

    dev_raw_data_source >> record_deduplicator >> azure_data_lake_store_destination
    record_deduplicator >> to_error

    datalake_dest_pipeline = builder.build().configure_for_environment(azure)
    sdc_executor.add_pipeline(datalake_dest_pipeline)
    dl_fs = azure.datalake.file_system
    try:
        # start pipeline and capture pipeline messages to assert
        logger.info(
            'Azure Data Lake directory %s will be created with files prefix %s',
            directory_name, files_prefix)
        sdc_executor.start_pipeline(
            datalake_dest_pipeline).wait_for_pipeline_output_records_count(2)
        sdc_executor.stop_pipeline(datalake_dest_pipeline)

        dl_files = dl_fs.ls(directory_name)

        # assert Data Lake files generated
        assert len(dl_files) == 1

        # assert file prefix and suffix
        dl_file_name = dl_files[0].split('/')[-1]
        assert dl_file_name.startswith(files_prefix) and dl_file_name.endswith(
            files_suffix)

        # Assert file content. File will have len(raw_list) JSON formatted records, delimited by newline (\n).
        dl_file_contents = dl_fs.cat(dl_files[0]).decode()
        result_list = [
            json.loads(line) for line in dl_file_contents.split('\n')
        ]

        assert raw_list == result_list
    finally:
        dl_files = dl_fs.ls(directory_name)
        # Note: Non-empty directory is not allowed to be removed, hence remove all files first.
        logger.info(
            'Azure Data Lake directory %s and underlying files will be deleted.',
            directory_name)
        for dl_file in dl_files:
            dl_fs.rm(dl_file)
        dl_fs.rmdir(directory_name)
def _test_emr_origin_to_s3(sdc_builder, sdc_executor, aws, pipeline_configs):
    s3_bucket = aws.emr_s3_bucket_name
    s3_input_key = '{0}/{1}/input'.format(
        S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))
    s3_output_key = '{0}/{1}/output'.format(
        S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))

    s3_staging_bucket = aws.emr_s3_staging_bucket_name
    s3_staging_key = '{0}/{1}/sdc_staging'.format(
        S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))
    s3_logging_key = '{0}/{1}/sdc_logging'.format(
        S3_SANDBOX_PREFIX, get_random_string(string.ascii_letters, 10))

    raw_str = 'Hello World!'
    s3_obj_count = 2  # keep it low so that the MR jobs don't take a long time to spin up and run

    logger.info(
        '%s S3 bucket used with input key: %s output key: %s and object count: %s',
        s3_bucket, s3_input_key, s3_output_key, s3_obj_count)
    logger.info(
        '%s S3 staging bucket used with EMR staging key: %s and EMR logging key: %s',
        s3_staging_bucket, s3_staging_key, s3_logging_key)

    # build pipeline
    builder = sdc_builder.get_pipeline_builder()

    emr_origin = builder.add_stage('Hadoop FS', type='origin')
    emr_origin.set_attributes(
        input_paths=[f's3a://{s3_bucket}/{s3_input_key}'], data_format='TEXT')

    s3_destination = builder.add_stage('Amazon S3', type='destination')
    s3_destination.set_attributes(bucket=s3_bucket,
                                  data_format='TEXT',
                                  partition_prefix=s3_output_key)

    emr_origin >> s3_destination

    pipeline = builder.build(
        title='Amazon EMR to S3 pipeline').configure_for_environment(aws)
    configs = {
        'executionMode': 'EMR_BATCH',
        'amazonEMRConfig.userRegion': aws.sdc_formatted_region,
        'amazonEMRConfig.accessKey': aws.aws_access_key_id,
        'amazonEMRConfig.secretKey': aws.aws_secret_access_key,
        'amazonEMRConfig.s3StagingUri':
        f's3://{s3_staging_bucket}/{s3_staging_key}',
        'amazonEMRConfig.s3LogUri':
        f's3://{s3_staging_bucket}/{s3_logging_key}',
        'amazonEMRConfig.enableEMRDebugging': False
    }
    configs.update(pipeline_configs)
    pipeline.configuration.update(configs)
    sdc_executor.add_pipeline(pipeline)

    client = aws.s3
    try:
        logger.info('Creating input S3 data ...')
        # Write the input objects to the bucket the Hadoop FS origin reads from.
        for i in range(s3_obj_count):
            client.put_object(Bucket=s3_bucket,
                              Key='{0}/{1}'.format(s3_input_key, i),
                              Body=raw_str)

        # let's not wait for pipeline start, as the transition from START to RUNNING takes more time
        sdc_executor.start_pipeline(
            pipeline, wait=False).wait_for_finished(timeout_sec=1800)

        # assert record count to S3 the size of the objects put
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket,
                                              Prefix=s3_output_key)
        assert len(list_s3_objs['Contents']) == s3_obj_count

        # read data from S3 to assert it is what got ingested into the pipeline
        s3_contents = [
            client.get_object(
                Bucket=s3_bucket,
                Key=s3_content['Key'])['Body'].read().decode().strip()
            for s3_content in list_s3_objs['Contents']
        ]

        assert s3_contents == [raw_str] * s3_obj_count
    finally:
        logger.info('Deleting input S3 data ...')
        delete_keys = {
            'Objects': [{
                'Key': k['Key']
            } for k in client.list_objects_v2(Bucket=s3_bucket,
                                              Prefix=s3_input_key)['Contents']]
        }
        client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)

        logger.info('Deleting output S3 data ...')
        delete_keys = {
            'Objects': [{
                'Key': k['Key']
            } for k in client.list_objects_v2(
                Bucket=s3_bucket, Prefix=s3_output_key)['Contents']]
        }
        client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
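_test_emr_origin_to_s3 is a shared helper; individual tests presumably call it with pipeline-level overrides merged on top of the EMR configuration. A hypothetical caller (the test name and the empty override dict are illustrative only):

def test_emr_origin_to_s3_defaults(sdc_builder, sdc_executor, aws):
    # Run the shared scenario with no extra pipeline configuration overrides.
    _test_emr_origin_to_s3(sdc_builder, sdc_executor, aws, pipeline_configs={})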
def test_mongodb_origin_simple_with_BSONBinary(sdc_builder, sdc_executor,
                                               mongodb):
    """
    Create 3 simple documents consists with BSON Binary data type in MongoDB and confirm that MongoDB origin reads them.

    The pipeline looks like:
        mongodb_origin >> trash
    """

    ORIG_BINARY_DOCS = [{
        'data': binary.Binary(b'Binary Data Flute')
    }, {
        'data': binary.Binary(b'Binary Data Oboe')
    }, {
        'data': binary.Binary(b'Binary Data Violin')
    }]

    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Discard')

    mongodb_origin = pipeline_builder.add_stage('MongoDB', type='origin')
    mongodb_origin.set_attributes(capped_collection=False,
                                  database=get_random_string(ascii_letters, 5),
                                  collection=get_random_string(
                                      ascii_letters, 10))

    trash = pipeline_builder.add_stage('Trash')
    mongodb_origin >> trash
    pipeline = pipeline_builder.build().configure_for_environment(mongodb)

    try:
        # MongoDB and PyMongo add '_id' to the dictionary entries e.g. docs_in_database
        # when used for inserting in collection. Hence the deep copy.
        docs_in_database = copy.deepcopy(ORIG_BINARY_DOCS)

        # Create documents in MongoDB using PyMongo.
        # First a database is created. Then a collection is created inside that database.
        # Then documents are created in that collection.
        logger.info('Adding documents into %s collection using PyMongo...',
                    mongodb_origin.collection)
        mongodb_database = mongodb.engine[mongodb_origin.database]
        mongodb_collection = mongodb_database[mongodb_origin.collection]
        insert_list = [
            mongodb_collection.insert_one(doc) for doc in docs_in_database
        ]
        assert len(insert_list) == len(docs_in_database)

        # Start the pipeline and verify the documents using a snapshot.
        sdc_executor.add_pipeline(pipeline)
        snapshot = sdc_executor.capture_snapshot(pipeline=pipeline,
                                                 start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(pipeline)
        rows_from_snapshot = [{
            'data': str(record.value2['data'])
        } for record in snapshot[mongodb_origin].output]

        assert rows_from_snapshot == [{
            'data': str(record.get('data'))
        } for record in ORIG_BINARY_DOCS]

    finally:
        logger.info('Dropping %s database...', mongodb_origin.database)
        mongodb.engine.drop_database(mongodb_origin.database)
def test_kinesis_consumer_at_timestamp(sdc_builder, sdc_executor, aws):
    """Test for Kinesis consumer origin stage, with AT_TIMESTAMP option. We do so by:
        - 1. Publishing data to a test stream
        - 2. Wait some time and store current timestamp
        - 3. Publishing new data
        - 4. Building a Kinesis Consumer pipeline that reads from the stored timestamp, passing it to the AT_TIMESTAMP option
        - 5. Assert that only the newest data has been read

     The pipelines look like:

     Kinesis Consumer pipeline: kinesis_consumer >> trash
    """

    # build stream
    application_name = get_random_string()
    stream_name = f'{aws.kinesis_stream_prefix}_{get_random_string()}'

    client = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')

        # 1. Publish data to the stream
        put_records = [{
            'Data': f'First Message {i}',
            'PartitionKey': '111'
        } for i in range(10)]
        client.put_records(Records=put_records, StreamName=stream_name)

        # 2. Wait and store timestamp
        time.sleep(10)
        timestamp = int(time.time()) * 1000

        # 3. Publish new data
        put_records = [{
            'Data': f'Second Message {i}',
            'PartitionKey': '111'
        } for i in range(10)]
        client.put_records(Records=put_records, StreamName=stream_name)

        # 4. Build consumer pipeline using timestamp
        builder = sdc_builder.get_pipeline_builder()
        builder.add_error_stage('Discard')
        kinesis_consumer = builder.add_stage('Kinesis Consumer')
        kinesis_consumer.set_attributes(application_name=application_name,
                                        data_format='TEXT',
                                        initial_position='AT_TIMESTAMP',
                                        initial_timestamp=timestamp,
                                        stream_name=stream_name)
        trash = builder.add_stage('Trash')
        kinesis_consumer >> trash

        consumer_origin_pipeline = builder.build(
            title='Kinesis Consumer pipeline').configure_for_environment(aws)
        sdc_executor.add_pipeline(consumer_origin_pipeline)

        # 5. Read the messages through the pipeline and assert that only the newest data was consumed
        snapshot = sdc_executor.capture_snapshot(consumer_origin_pipeline,
                                                 start_pipeline=True,
                                                 batches=1).snapshot
        sdc_executor.stop_pipeline(consumer_origin_pipeline)
        output_records = [
            record.field
            for record in snapshot[kinesis_consumer.instance_name].output
        ]

        assert all('Second' in str(output_record)
                   for output_record in output_records)
    finally:
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        client.delete_stream(
            StreamName=stream_name
        )  # Stream operations are done. Delete the stream.
        logger.info('Deleting %s DynamoDB table on AWS ...', application_name)
        aws.dynamodb.delete_table(TableName=application_name)
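# For reference, the AT_TIMESTAMP behaviour exercised above maps directly onto the Kinesis API:
# a shard iterator obtained at a timestamp only returns records put at or after that time.
# A minimal boto3 sketch under assumed stream/shard names (the stage above takes the timestamp
# in epoch milliseconds, whereas boto3 accepts a datetime or epoch seconds):
import boto3
from datetime import datetime, timezone

kinesis = boto3.client('kinesis')
shard_iterator = kinesis.get_shard_iterator(
    StreamName='my-test-stream',              # assumed stream name
    ShardId='shardId-000000000000',
    ShardIteratorType='AT_TIMESTAMP',
    Timestamp=datetime.now(timezone.utc))['ShardIterator']
# Only records published at or after Timestamp are returned.
records = kinesis.get_records(ShardIterator=shard_iterator)['Records']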
Example #18
def test_schema_generator_types(sdc_builder, input, converter_type,
                                expected_value, sdc_executor):
    # Test write directory
    tmp_directory = os.path.join(tempfile.gettempdir(),
                                 get_random_string(string.ascii_letters, 10))

    # Build pipeline that will generate the test record and its schema
    builder = sdc_builder.get_pipeline_builder()

    origin = builder.add_stage('Dev Raw Data Source')
    origin.data_format = 'JSON'
    origin.raw_data = json.dumps({"value": input})
    origin.stop_after_first_batch = True

    prefix = origin

    if converter_type != 'MAP' and converter_type != 'LIST':
        converter = builder.add_stage('Field Type Converter')
        converter.conversion_method = 'BY_FIELD'
        converter.field_type_converter_configs = [{
            'fields': ['/value'],
            'targetType': converter_type,
            'dataLocale': 'en,US',
            'dateFormat': 'YYYY_MM_DD_HH_MM_SS',
            'zonedDateTimeFormat': 'ISO_OFFSET_DATE_TIME',
            'scale': 2
        }]

        origin >> converter
        prefix = converter

    # Generate schema for that record
    schema_generator = builder.add_stage('Schema Generator')
    schema_generator.expand_types = True
    schema_generator.schema_name = 'test_schema'

    # And store it in local file system
    local_fs = builder.add_stage('Local FS', type='destination')
    local_fs.directory_template = tmp_directory
    local_fs.data_format = 'AVRO'
    local_fs.configuration[
        'configs.dataGeneratorFormatConfig.avroSchemaSource'] = 'HEADER'

    # Finish building the pipeline
    prefix >> schema_generator >> local_fs
    generator_pipeline = builder.build()

    # Build second pipeline that will read generated Avro file
    builder = sdc_builder.get_pipeline_builder()

    directory = builder.add_stage('Directory', type='origin')
    directory.data_format = 'AVRO'
    directory.batch_wait_time_in_secs = 10
    directory.file_name_pattern = 'sdc*'
    directory.files_directory = tmp_directory

    finisher = builder.add_stage("Pipeline Finisher Executor")
    finisher.stage_record_preconditions = [
        "${record:eventType() == 'no-more-data'}"
    ]
    directory >= finisher

    wiretap = builder.add_wiretap()

    directory >> wiretap.destination
    reader_pipeline = builder.build()

    sdc_executor.add_pipeline(generator_pipeline, reader_pipeline)
    # Start the pipelines one by one
    sdc_executor.start_pipeline(generator_pipeline).wait_for_finished()
    sdc_executor.start_pipeline(reader_pipeline).wait_for_finished()

    records = wiretap.output_records
    assert len(records) == 1
    assert records[0].field['value'] == expected_value
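# The (input, converter_type, expected_value) arguments above come from a pytest parametrization
# that is not shown in this excerpt. A hypothetical sketch of such cases (values are illustrative
# only, not the original test matrix):
SCHEMA_GENERATOR_TYPE_CASES = [
    ('true', 'BOOLEAN', True),
    ('123', 'INTEGER', 123),
    ('10.2', 'DOUBLE', 10.2),
    ('text', 'STRING', 'text'),
]
# which would be applied to the test roughly as:
# @pytest.mark.parametrize('input,converter_type,expected_value', SCHEMA_GENERATOR_TYPE_CASES)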
def test_hbase_empty_key_expression(sdc_builder, sdc_executor, cluster):
    """Check empty key expression in hbase lookup processor gives a configuration issue
    dev_raw_data_source >> hbase_lookup >> trash
    """
    # Generate some silly data.
    bike_races = [
        dict(name='Tour de France', first_edition='1903'),
        dict(name="Giro d'Italia", first_edition='1909'),
        dict(name='Vuelta a Espana', first_edition='1935')
    ]

    # Convert to raw data for the Dev Raw Data Source.
    raw_data = '\n'.join(bike_race['name'] for bike_race in bike_races)

    # Generate HBase Lookup's attributes.
    lookup_parameters = [
        dict(rowExpr='',
             columnExpr='info:first_edition',
             outputFieldPath='/founded',
             timestampExpr='')
    ]

    # Get random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Source stage.
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT', raw_data=raw_data)

    # Create HBase Lookup processor.
    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(lookup_parameters=lookup_parameters,
                                table_name=table_name)

    # Create trash destination.
    trash = pipeline_builder.add_stage('Trash')

    # Build pipeline.
    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating HBase table %s ...', table_name)
        cluster.hbase.client.create_table(name=table_name,
                                          families={'info': {}})

        issues = sdc_executor.api_client.export_pipeline(
            pipeline.id)['pipelineConfig']['issues']
        assert 0 == issues['issueCount']

        # Start pipeline.
        with pytest.raises(Exception) as e:
            sdc_executor.start_pipeline(pipeline)
            sdc_executor.stop_pipeline(pipeline)
        assert 'HBASE_35' in e.value.message
        assert 'HBASE_35 - Row key field has empty value' in e.value.message

    finally:
        # Delete HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        cluster.hbase.client.delete_table(name=table_name, disable=True)
def topic():
    """Topic name used for this specific test."""
    topic = get_random_string(string.ascii_letters, 10)
    logger.debug('Using Topic: %s', topic)
    return topic
def test_hbase_lookup_processor_invalid_column_family(sdc_builder,
                                                      sdc_executor, cluster):
    """HBase Lookup processor test.
    The pipeline has an invalid column family; an HBASE_36 error is expected.
    dev_raw_data_source >> hbase_lookup >> trash
    """
    # Generate some silly data.
    bike_races = [
        dict(name='Tour de France', first_edition='1903'),
        dict(name="Giro d'Italia", first_edition='1909'),
        dict(name='Vuelta a Espana', first_edition='1935')
    ]

    # Convert to raw data for the Dev Raw Data Source.
    raw_data = '\n'.join(bike_race['name'] for bike_race in bike_races)

    # Generate HBase Lookup's attributes.
    lookup_parameters = [
        dict(rowExpr="${record:value('/text')}",
             columnExpr='info:first_edition',
             outputFieldPath='/founded',
             timestampExpr=''),
        dict(rowExpr="${record:value('/text')}",
             columnExpr='invalid:column',
             outputFieldPath='/founded',
             timestampExpr='')
    ]

    # Get random table name to avoid collisions.
    table_name = get_random_string(string.ascii_letters, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Source stage.
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(
        data_format='TEXT',
        raw_data=raw_data,
    )

    # Create HBase Lookup processor.
    hbase_lookup = pipeline_builder.add_stage('HBase Lookup')
    hbase_lookup.set_attributes(on_record_error='TO_ERROR',
                                lookup_parameters=lookup_parameters,
                                table_name=table_name)

    # Create trash destination.
    trash = pipeline_builder.add_stage('Trash')

    # Build pipeline.
    dev_raw_data_source >> hbase_lookup >> trash
    pipeline = pipeline_builder.build().configure_for_environment(cluster)
    pipeline.configuration['shouldRetry'] = False
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating HBase table %s ...', table_name)
        cluster.hbase.client.create_table(name=table_name,
                                          families={'info': {}})

        # Start pipeline.
        with pytest.raises(Exception) as e:
            sdc_executor.start_pipeline(pipeline)
            sdc_executor.stop_pipeline(pipeline)
        assert 'HBASE_36' in e.value.message

    finally:
        # Delete HBase table.
        logger.info('Deleting HBase table %s ...', table_name)
        cluster.hbase.client.delete_table(name=table_name, disable=True)
def test_elasticsearch_credentials_format(sdc_builder, sdc_executor,
                                          elasticsearch, join_credentials):
    """
    Elasticsearch target pipeline covering two different formats for the credential values: the legacy format with
    "username:password" joined in the user name field, and the new format with the user name and password supplied in
    two separate fields (selected via the join_credentials parameter).
        dev_raw_data_source >> es_target
    """
    # Test static
    es_index = get_random_string(
        string.ascii_letters,
        10).lower()  # Elasticsearch indexes must be lower case
    es_mapping = get_random_string(string.ascii_letters, 10)
    es_doc_id = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    if join_credentials:
        username = elasticsearch.username + ':' + elasticsearch.password
        password = ''
    else:
        username = elasticsearch.username
        password = elasticsearch.password

    # Build pipeline
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='TEXT',
                                              stop_after_first_batch=True,
                                              raw_data=raw_str)
    es_target = builder.add_stage('Elasticsearch', type='destination')
    es_target.set_attributes(default_operation='INDEX',
                             document_id=es_doc_id,
                             index=es_index,
                             mapping=es_mapping,
                             use_security=True,
                             user_name=username,
                             password=password)

    dev_raw_data_source >> es_target
    es_target_pipeline = builder.build().configure_for_environment(
        elasticsearch)
    es_target_pipeline.configuration["shouldRetry"] = False

    sdc_executor.add_pipeline(es_target_pipeline)

    try:
        elasticsearch.client.create_index(es_index)

        # Run pipeline and read credential values from Elasticsearch to assert
        sdc_executor.start_pipeline(es_target_pipeline).wait_for_finished()

        # Since we upsert on the same index, mapping and doc ID, there should be only one document (index 0)
        response = elasticsearch.client.search(es_index)
        assert len(response) == 1
        assert response[0]['_index'] == es_index
        assert response[0]['_id'] == es_doc_id
        assert response[0]['_type'] == es_mapping
        assert response[0]['_source'] == {'text': raw_str}
    finally:
        # Clean up test data in ES
        elasticsearch.client.delete_index(es_index)
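# join_credentials selects which credential format is exercised. A hypothetical parametrized
# fixture that would drive both branches of the test above:
import pytest

@pytest.fixture(params=[True, False], ids=['joined', 'split'])
def join_credentials(request):
    # True  -> legacy format: "username:password" in the user name field, empty password
    # False -> new format: user name and password supplied as two separate fields
    return request.param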
def test_postgres_cdc_client_basic(sdc_builder, sdc_executor, database):
    """Basic test that inserts/updates/deletes to a Postgres table,
    and validates that they are read in the same order.
    Here the `Initial Change` configuration is left at its default value, `From the latest change`,
    so the origin processes all changes that occur after the pipeline is started.

    The pipeline looks like:
        postgres_cdc_client >> trash
    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=False,
                                       replication_slot=replication_slot_name)
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    table = None
    try:
        # Database operations done after pipeline start will be captured by CDC.
        # Hence start the pipeline but do not wait for the capture to be finished.
        snapshot_command = sdc_executor.capture_snapshot(pipeline,
                                                         start_pipeline=True,
                                                         wait=False)

        # Create table and then perform insert, update and delete operations.
        table = _create_table_in_database(table_name, database)
        connection = database.engine.connect()
        expected_operations_data = _insert(connection=connection, table=table)
        expected_operations_data += _update(connection=connection, table=table)
        expected_operations_data += _delete(connection=connection, table=table)

        snapshot = snapshot_command.wait_for_finished().snapshot

        # Verify snapshot data is received in exact order as expected.
        operation_index = 0
        for record in snapshot[postgres_cdc_client.instance_name].output:
            # No need to worry about DDL related CDC records. e.g. table creation etc.
            if record.get_field_data('/change'):
                # Since we performed each operation (insert, update and delete) on 3 rows,
                # each CDC  record change contains a list of 3 elements.
                for i in range(3):
                    expected = expected_operations_data[operation_index]
                    assert expected.kind == record.get_field_data(
                        f'/change[{i}]/kind')
                    assert expected.table == record.get_field_data(
                        f'/change[{i}]/table')
                    # For delete operation there are no columnnames and columnvalues fields.
                    if expected.kind != KIND_FOR_DELETE:
                        assert expected.columnnames == record.get_field_data(
                            f'/change[{i}]/columnnames')
                        assert expected.columnvalues == record.get_field_data(
                            f'/change[{i}]/columnvalues')
                    if expected.kind != KIND_FOR_INSERT:
                        # For update and delete operations verify extra information about old keys.
                        assert expected.oldkeys.keynames == record.get_field_data(
                            f'/change[{i}]/oldkeys/keynames')
                        assert expected.oldkeys.keyvalues == record.get_field_data(
                            f'/change[{i}]/oldkeys/keyvalues')
                    operation_index += 1

    finally:
        if pipeline:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        database.deactivate_and_drop_replication_slot(replication_slot_name)
        if table is not None:
            table.drop(database.engine)
            logger.info('Table: %s dropped.', table_name)
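# The helpers _create_table_in_database, _insert, _update and _delete are defined elsewhere in
# the module. A minimal sketch of the table-creation helper, assuming SQLAlchemy and a simple
# id/name schema (column names and types are assumptions, not the original code):
import sqlalchemy

def _create_table_in_database(table_name, database):
    metadata = sqlalchemy.MetaData()
    table = sqlalchemy.Table(table_name,
                             metadata,
                             sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
                             sqlalchemy.Column('name', sqlalchemy.String(32)))
    logger.info('Creating table %s ...', table_name)
    table.create(database.engine)
    return table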
def _run_test_s3_error_destination(sdc_builder, sdc_executor, aws, anonymous):
    try:
        if anonymous:
            s3_bucket = create_bucket(aws)
            logger.info(f'Bucket {s3_bucket} created')
        else:
            s3_bucket = aws.s3_bucket_name

        s3_key = f'{S3_SANDBOX_PREFIX}/errDest-{get_random_string()}/'
        random_string = get_random_string(string.ascii_letters, 10)
        random_raw_json_str = f'{{"text":"{random_string}"}}'

        # Build pipeline.
        builder = sdc_builder.get_pipeline_builder()
        s3_err = builder.add_error_stage('Write to Amazon S3')
        s3_err.set_attributes(bucket=s3_bucket, common_prefix=s3_key)
        if anonymous:
            configure_stage_for_anonymous(s3_err)

        origin = builder.add_stage('Dev Raw Data Source', type='origin')
        origin.set_attributes(data_format='JSON',
                              raw_data=random_raw_json_str,
                              stop_after_first_batch=True)

        target = builder.add_stage('To Error', type='destination')

        origin >> target

        pipeline = builder.build().configure_for_environment(aws)
        pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(pipeline)

        # Now we build and run another pipeline with an S3 Origin to read the data back
        builder = sdc_builder.get_pipeline_builder()
        s3_origin = builder.add_stage('Amazon S3', type='origin')
        s3_origin.set_attributes(bucket=s3_bucket,
                                 data_format='SDC_JSON',
                                 prefix_pattern=f'{s3_key}*',
                                 max_batch_size_in_records=100)
        if anonymous:
            configure_stage_for_anonymous(s3_origin)

        wiretap = builder.add_wiretap()
        finisher = builder.add_stage('Pipeline Finisher Executor')
        finisher.set_attributes(stage_record_preconditions=[
            "${record:eventType() == 'no-more-data'}"
        ])

        s3_origin >> wiretap.destination
        s3_origin >= finisher

        read_pipeline = builder.build().configure_for_environment(aws)
        read_pipeline.configuration['shouldRetry'] = False
        sdc_executor.add_pipeline(read_pipeline)

        client = aws.s3
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        # We should have exactly one file in the bucket
        list_s3_objs = client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key)
        assert 'Contents' in list_s3_objs  # If no object was found, there is no 'Contents' key
        assert len(list_s3_objs['Contents']) == 1

        sdc_executor.start_pipeline(read_pipeline).wait_for_finished()
        assert len(wiretap.output_records) == 1
        assert [record.field['text']
                for record in wiretap.output_records][0] == random_string
    finally:
        try:
            aws.delete_s3_data(s3_bucket, s3_key)
        finally:
            if anonymous:
                logger.info(f'Deleting bucket {s3_bucket}')
                aws.s3.delete_bucket(Bucket=s3_bucket)
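# S3_SANDBOX_PREFIX, create_bucket and configure_stage_for_anonymous come from shared test
# utilities that are not shown here. A rough sketch of the bucket-creation helper, assuming
# plain boto3 calls (bucket naming and region handling are simplified assumptions):
S3_SANDBOX_PREFIX = 'sandbox'  # assumed prefix used to namespace test objects

def create_bucket(aws):
    s3_bucket = f'stf-anon-{get_random_string(string.ascii_letters, 10).lower()}'
    # Note: outside us-east-1 a CreateBucketConfiguration with a LocationConstraint is required.
    aws.s3.create_bucket(Bucket=s3_bucket)
    return s3_bucket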
def test_postgres_cdc_client_remove_replication_slot(sdc_builder, sdc_executor,
                                                     database):
    """
        Test the 'Remove replication slot on close' functionality

        1.  Initialize and start pipeline with specified replication slot
        2.  Pass some data
        3.  Stop the pipeline
        4.  Query postgres database for replication slots, checking removal
    """
    if database.database_server_version < databases.EARLIEST_POSTGRESQL_VERSION_WITH_ACTIVE_PID:
        # Test only runs against PostgreSQL version with active_pid column in pg_replication_slots.
        pytest.skip(
            'Test only runs against PostgreSQL version >= '
            f"{'.'.join(str(item) for item in databases.EARLIEST_POSTGRESQL_VERSION_WITH_ACTIVE_PID)}"
        )
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)
    replication_slot = get_random_string(string.ascii_lowercase, 10)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=True,
                                       replication_slot=replication_slot)
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    table = None
    try:
        # Database operations done after pipeline start will be captured by CDC.
        # Hence start the pipeline but do not wait for the capture to be finished.
        snapshot_command = sdc_executor.capture_snapshot(pipeline,
                                                         start_pipeline=True,
                                                         wait=False)

        # Create table and then perform some operations to simulate activity
        table = _create_table_in_database(table_name, database)
        connection = database.engine.connect()
        expected_operations_data = _insert(connection=connection, table=table)
        expected_operations_data += _update(connection=connection, table=table)
        expected_operations_data += _delete(connection=connection, table=table)

        snapshot = snapshot_command.wait_for_finished().snapshot

        # A timeout is set because, without SDC-11252, the pipeline would get stuck in the 'STOPPING' state forever
        sdc_executor.stop_pipeline(pipeline=pipeline).wait_for_stopped(
            timeout_sec=60)

        # After pipeline stoppage, check on the replication slots remaining
        listed_slots = connection.execute(CHECK_REP_SLOT_QUERY).fetchall()

        # Check that replication_slot is not in listed_slots
        logger.info('Replication slot: %s', replication_slot)
        logger.info('List of current slots: %s', listed_slots)
        assert (replication_slot, ) not in listed_slots

    finally:
        if table is not None:
            table.drop(database.engine)
            logger.info('Table: %s dropped.', table_name)
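# CHECK_REP_SLOT_QUERY is a module-level constant queried above. A sketch of a query that would
# produce the single-column (slot_name,) tuples the assertion checks against:
CHECK_REP_SLOT_QUERY = 'SELECT slot_name FROM pg_replication_slots;'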
def test_kinesis_consumer(sdc_builder, sdc_executor, aws):
    """Test for Kinesis consumer origin stage. We do so by publishing data to a test stream using Kinesis client and
    having a pipeline which reads that data using Kinesis consumer origin stage. Data is then asserted for what is
    published at Kinesis client and what we read in the pipeline snapshot. The pipeline looks like:

    Kinesis Consumer pipeline:
        kinesis_consumer >> trash
    """
    # build consumer pipeline
    application_name = get_random_string(string.ascii_letters, 10)
    stream_name = '{}_{}'.format(aws.kinesis_stream_prefix,
                                 get_random_string(string.ascii_letters, 10))

    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    kinesis_consumer = builder.add_stage('Kinesis Consumer')
    kinesis_consumer.set_attributes(application_name=application_name,
                                    data_format='TEXT',
                                    initial_position='TRIM_HORIZON',
                                    stream_name=stream_name)

    trash = builder.add_stage('Trash')

    kinesis_consumer >> trash

    consumer_origin_pipeline = builder.build(
        title='Kinesis Consumer pipeline').configure_for_environment(aws)
    sdc_executor.add_pipeline(consumer_origin_pipeline)

    # run pipeline and capture snapshot
    client = aws.kinesis
    try:
        logger.info('Creating %s Kinesis stream on AWS ...', stream_name)
        client.create_stream(StreamName=stream_name, ShardCount=1)
        aws.wait_for_stream_status(stream_name=stream_name, status='ACTIVE')

        expected_messages = set('Message {0}'.format(i) for i in range(10))
        # not using PartitionKey logic and hence assign some temp key
        put_records = [{
            'Data': exp_msg,
            'PartitionKey': '111'
        } for exp_msg in expected_messages]
        client.put_records(Records=put_records, StreamName=stream_name)

        # messages are published, read through the pipeline and assert
        snapshot = sdc_executor.capture_snapshot(consumer_origin_pipeline,
                                                 start_pipeline=True).snapshot
        sdc_executor.stop_pipeline(consumer_origin_pipeline)

        output_records = [
            record.field['text'].value
            for record in snapshot[kinesis_consumer.instance_name].output
        ]

        assert set(output_records) == expected_messages
    finally:
        logger.info('Deleting %s Kinesis stream on AWS ...', stream_name)
        client.delete_stream(
            StreamName=stream_name
        )  # Stream operations are done. Delete the stream.
        logger.info('Deleting %s DynamoDB table on AWS ...', application_name)
        aws.dynamodb.delete_table(TableName=application_name)
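# The DynamoDB table deleted in the finally block above is the lease/checkpoint table that the
# Kinesis Client Library creates under the consumer's application name. If needed, its presence
# could be checked with a plain boto3 call, e.g.:
# aws.dynamodb.describe_table(TableName=application_name)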
Example #27
def test_ftp_destination(sdc_builder, sdc_executor, ftp):
    """Smoke test FTP destination. We first create a local file using Local FS destination stage and use that file
    for FTP destination stage to see if it gets successfully uploaded.
    The pipelines look like:
        dev_raw_data_source >> local_fs
        directory >> sftp_ftp_client
    """
    # Our destination FTP file name
    ftp_file_name = get_random_string(string.ascii_letters, 10)
    # Local temporary directory where we will create a source file to be uploaded to FTP server
    local_tmp_directory = os.path.join(tempfile.gettempdir(), get_random_string(string.ascii_letters, 10))

    # Build source file pipeline logic
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.data_format = 'TEXT'
    dev_raw_data_source.raw_data = 'Hello World!'
    dev_raw_data_source.stop_after_first_batch = True

    local_fs = builder.add_stage('Local FS', type='destination')
    local_fs.directory_template = local_tmp_directory
    local_fs.data_format = 'TEXT'

    dev_raw_data_source >> local_fs
    local_fs_pipeline = builder.build('Local FS Pipeline')

    builder = sdc_builder.get_pipeline_builder()

    # Build FTP destination pipeline logic
    directory = builder.add_stage('Directory', type='origin')
    directory.data_format = 'WHOLE_FILE'
    directory.file_name_pattern = 'sdc*'
    directory.files_directory = local_tmp_directory

    sftp_ftp_client = builder.add_stage('SFTP/FTP Client', type='destination')
    sftp_ftp_client.file_name_expression = ftp_file_name

    directory >> sftp_ftp_client
    sftp_ftp_client_pipeline = builder.build('FTP Destination Pipeline').configure_for_environment(ftp)

    sdc_executor.add_pipeline(local_fs_pipeline, sftp_ftp_client_pipeline)

    # Start source file creation pipeline and assert file has been created with expected number of records
    sdc_executor.start_pipeline(local_fs_pipeline).wait_for_finished()
    history = sdc_executor.get_pipeline_history(local_fs_pipeline)
    assert history.latest.metrics.counter('pipeline.batchInputRecords.counter').count == 1
    assert history.latest.metrics.counter('pipeline.batchOutputRecords.counter').count == 1

    # Start FTP upload (destination) file pipeline and assert pipeline has processed expected number of files
    sdc_executor.start_pipeline(sftp_ftp_client_pipeline).wait_for_pipeline_output_records_count(1)
    sdc_executor.stop_pipeline(sftp_ftp_client_pipeline)
    history = sdc_executor.get_pipeline_history(sftp_ftp_client_pipeline)
    assert history.latest.metrics.counter('pipeline.batchInputRecords.counter').count == 1
    assert history.latest.metrics.counter('pipeline.batchOutputRecords.counter').count == 1

    # Read FTP destination file and compare our source data to assert
    assert ftp.get_string(ftp_file_name) == dev_raw_data_source.raw_data

    # Delete the test FTP destination file we created
    client = ftp.client
    try:
        client.delete(ftp_file_name)
    finally:
        client.quit()
def test_firehose_destination_to_s3(sdc_builder, sdc_executor, aws):
    """Test for Firehose target stage. This test assumes Firehose is destined to S3 bucket. We run a dev raw data source
    generator to Firehose destination which is pre-setup to put to S3 bucket. We then read S3 bucket using STF client
    to assert data between the client to what has been ingested into the pipeline. The pipeline looks like:

    Firehose Destination pipeline:
        dev_raw_data_source >> record_deduplicator >> firehose_destination
                                                   >> to_error
    """
    s3_client = aws.s3
    firehose_client = aws.firehose

    # setup test static
    s3_bucket = aws.s3_bucket_name
    stream_name = aws.firehose_stream_name
    # json formatted string
    random_raw_str = '{{"text":"{0}"}}'.format(
        get_random_string(string.ascii_letters, 10))
    record_count = 1  # random_raw_str record size
    s3_put_keys = []

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='JSON',
                                              raw_data=random_raw_str)

    record_deduplicator = builder.add_stage('Record Deduplicator')
    to_error = builder.add_stage('To Error')

    firehose_destination = builder.add_stage('Kinesis Firehose')
    firehose_destination.set_attributes(stream_name=stream_name,
                                        data_format='JSON')

    dev_raw_data_source >> record_deduplicator >> firehose_destination
    record_deduplicator >> to_error

    firehose_dest_pipeline = builder.build(
        title='Amazon Firehose destination pipeline'
    ).configure_for_environment(aws)
    sdc_executor.add_pipeline(firehose_dest_pipeline)

    try:
        # start pipeline and assert
        sdc_executor.start_pipeline(
            firehose_dest_pipeline).wait_for_pipeline_output_records_count(
                record_count)
        sdc_executor.stop_pipeline(firehose_dest_pipeline)

        # Wait until data is available in S3. We do so by querying the delivery stream's buffering interval and sleeping for that long.
        resp = firehose_client.describe_delivery_stream(
            DeliveryStreamName=stream_name)
        dests = resp['DeliveryStreamDescription']['Destinations'][0]
        wait_secs = dests['ExtendedS3DestinationDescription'][
            'BufferingHints']['IntervalInSeconds']
        time.sleep(
            wait_secs +
            15)  # few seconds more to wait to make sure S3 gets the data

        # Firehose S3 object naming http://docs.aws.amazon.com/firehose/latest/dev/basic-deliver.html#s3-object-name
        # read data to assert
        list_s3_objs = s3_client.list_objects_v2(
            Bucket=s3_bucket, Prefix=datetime.utcnow().strftime("%Y/%m/%d"))
        for s3_content in list_s3_objs['Contents']:
            akey = s3_content['Key']
            aobj = s3_client.get_object(Bucket=s3_bucket, Key=akey)
            if aobj['Body'].read().decode().strip() == random_raw_str:
                s3_put_keys.append(akey)

        assert len(s3_put_keys) == record_count
    finally:
        # delete S3 objects related to this test
        if len(s3_put_keys) > 0:
            delete_keys = {'Objects': [{'Key': k} for k in s3_put_keys]}
            s3_client.delete_objects(Bucket=s3_bucket, Delete=delete_keys)
def test_hadoop_fs_origin_simple(sdc_builder, sdc_executor, cluster):
    """Write a simple file into a Hadoop FS folder with a randomly-generated name and confirm that the Hadoop FS origin
    successfully reads it. Because cluster mode pipelines don't support snapshots, we do this verification using a
    second standalone pipeline whose origin is an SDC RPC written to by the Hadoop FS pipeline. Specifically, this would
    look like:
    Hadoop FS pipeline:
        hadoop_fs_origin >> sdc_rpc_destination
    Snapshot pipeline:
        sdc_rpc_origin >> trash
    """
    hadoop_fs_folder = '/tmp/out/{}'.format(get_random_string(string.ascii_letters, 10))

    # Build the Hadoop FS pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    hadoop_fs = builder.add_stage('Hadoop FS', type='origin')
    hadoop_fs.data_format = 'TEXT'
    hadoop_fs.input_paths.append(hadoop_fs_folder)

    sdc_rpc_destination = builder.add_stage('SDC RPC', type='destination')
    sdc_rpc_destination.sdc_rpc_connection.append('{}:{}'.format(sdc_executor.server_host,
                                                                 SDC_RPC_LISTENING_PORT))
    sdc_rpc_destination.sdc_rpc_id = get_random_string(string.ascii_letters, 10)

    hadoop_fs >> sdc_rpc_destination
    hadoop_fs_pipeline = builder.build(title='Hadoop FS pipeline').configure_for_environment(cluster)
    hadoop_fs_pipeline.configuration['executionMode'] = 'CLUSTER_BATCH'

    # Build the Snapshot pipeline.
    builder = sdc_builder.get_pipeline_builder()
    builder.add_error_stage('Discard')

    sdc_rpc_origin = builder.add_stage('SDC RPC', type='origin')
    sdc_rpc_origin.sdc_rpc_listening_port = SDC_RPC_LISTENING_PORT
    sdc_rpc_origin.sdc_rpc_id = sdc_rpc_destination.sdc_rpc_id
    # Since YARN jobs take a while to get going, set RPC origin batch wait time to 5 min. to avoid
    # getting an empty batch in the snapshot.
    sdc_rpc_origin.batch_wait_time_in_secs = 300

    trash = builder.add_stage('Trash')

    sdc_rpc_origin >> trash
    snapshot_pipeline = builder.build(title='Snapshot pipeline')

    # Add both pipelines we just created to SDC and start writing files to Hadoop FS with the HDFS client.
    sdc_executor.add_pipeline(hadoop_fs_pipeline, snapshot_pipeline)

    try:
        lines_in_file = ['hello', 'hi', 'how are you?']

        logger.debug('Writing file %s/file.txt to Hadoop FS ...', hadoop_fs_folder)
        cluster.hdfs.client.makedirs(hadoop_fs_folder)
        cluster.hdfs.client.write(os.path.join(hadoop_fs_folder, 'file.txt'), data='\n'.join(lines_in_file))

        # So here's where we do the clever stuff. We use SDC's capture snapshot endpoint to start and begin
        # capturing a snapshot from the snapshot pipeline. We do this, however, without using the synchronous
        # wait_for_finished function. That way, we can switch over and start the Hadoop FS pipeline. Once that one
        # completes, we can go back and do an assert on the snapshot pipeline's snapshot.
        logger.debug('Starting snapshot pipeline and capturing snapshot ...')
        snapshot_pipeline_command = sdc_executor.capture_snapshot(snapshot_pipeline, start_pipeline=True,
                                                                  wait=False)

        logger.debug('Starting Hadoop FS pipeline and waiting for it to finish ...')
        sdc_executor.start_pipeline(hadoop_fs_pipeline)

        snapshot = snapshot_pipeline_command.wait_for_finished(timeout_sec=120).snapshot
        sdc_executor.stop_pipeline(snapshot_pipeline, force=True)
        lines_from_snapshot = [record.field['text'].value
                               for record in snapshot[snapshot_pipeline[0].instance_name].output]

        assert lines_from_snapshot == lines_in_file
    finally:
        cluster.hdfs.client.delete(hadoop_fs_folder, recursive=True)
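# SDC_RPC_LISTENING_PORT is a module-level constant shared by the SDC RPC destination and origin
# above; any port that is free on the executor works. A plausible definition:
SDC_RPC_LISTENING_PORT = 20000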
def test_file_tail_origin_stop_continue(sdc_builder, sdc_executor):
    """Test File Tail Origin. We test by making sure files are pre-created using Local FS destination stage pipeline
    and then have the File Tail Origin read those files. The pipelines looks like:

        dev_raw_data_source >> local_fs

        file_tail >> trash
    """
    raw_data = 'Hello!\n' * 10
    tmp_directory = os.path.join(tempfile.gettempdir(),
                                 get_random_string(string.ascii_letters, 10))

    # 1st pipeline, which generates the files required by the File Tail origin
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       raw_data=raw_data,
                                       stop_after_first_batch=True)
    local_fs = pipeline_builder.add_stage('Local FS', type='destination')
    local_fs.set_attributes(data_format='TEXT',
                            directory_template=tmp_directory,
                            files_prefix='sdc-',
                            max_records_in_file=100)

    dev_raw_data_source >> local_fs
    files_pipeline = pipeline_builder.build('Generate file for start-stop')
    sdc_executor.add_pipeline(files_pipeline)
    sdc_executor.start_pipeline(files_pipeline).wait_for_finished()

    # 2nd pipeline which reads the files using File Tail stage
    pipeline_builder = sdc_builder.get_pipeline_builder()
    file_tail = pipeline_builder.add_stage('File Tail', type='origin')
    file_tail.set_attributes(data_format='TEXT',
                             file_to_tail=[{
                                 'fileRollMode': 'ALPHABETICAL',
                                 'fileFullPath': f'{tmp_directory}/*'
                             }])
    wiretap_1 = pipeline_builder.add_wiretap()
    wiretap_2 = pipeline_builder.add_wiretap()

    file_tail >> wiretap_1.destination
    file_tail >> wiretap_2.destination

    file_tail_pipeline = pipeline_builder.build('File Tail Origin pipeline')
    sdc_executor.add_pipeline(file_tail_pipeline)

    sdc_executor.start_pipeline(file_tail_pipeline)
    sdc_executor.wait_for_pipeline_metric(file_tail_pipeline,
                                          'data_batch_count', 1)
    sdc_executor.stop_pipeline(file_tail_pipeline)

    # Assert that all the captured data matches raw_data.
    # The wiretap output mixes data records and event records; iterate and assert only the data
    # records, identified by the presence of a 'text' field.

    size_output = 0

    for record in wiretap_1.output_records:
        if 'text' in record.field:
            assert 'Hello!' == record.field['text'].value
            size_output += 1
    for record in wiretap_2.output_records:
        if 'text' in record.field:
            assert 'Hello!' == record.field['text'].value
            size_output += 1

    assert size_output == 10

    raw_data = 'Bye!\n' * 10
    # Another file-generation pipeline to produce new data for the File Tail origin
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='TEXT',
                                       raw_data=raw_data,
                                       stop_after_first_batch=True)
    local_fs = pipeline_builder.add_stage('Local FS', type='destination')
    local_fs.set_attributes(data_format='TEXT',
                            directory_template=tmp_directory,
                            files_prefix='sdc-',
                            max_records_in_file=100)

    dev_raw_data_source >> local_fs
    files_pipeline_2 = pipeline_builder.build('Generate file for start-stop 2')
    sdc_executor.add_pipeline(files_pipeline_2)
    sdc_executor.start_pipeline(files_pipeline_2).wait_for_finished()

    wiretap_1.reset()
    wiretap_2.reset()
    sdc_executor.start_pipeline(file_tail_pipeline)
    sdc_executor.wait_for_pipeline_metric(file_tail_pipeline,
                                          'data_batch_count', 1)
    sdc_executor.stop_pipeline(file_tail_pipeline)

    size_output = 0

    for record in wiretap_1.output_records:
        if 'text' in record.field:
            assert 'Bye!' == record.field['text'].value
            size_output += 1
    for record in wiretap_2.output_records:
        if 'text' in record.field:
            assert 'Bye!' == record.field['text'].value
            size_output += 1

    assert size_output == 10