def test_mapr_db_destination(sdc_builder, sdc_executor, cluster):
    """Write a handful of records to the MapR-DB destination and confirm their presence with an HBase client.

    dev_raw_data_source >> mapr_db
    """
    # Generate some data.
    bike_brands = [dict(name='Bianchi'),
                   dict(name='BMC'),
                   dict(name='Cannondale'),
                   dict(name='Specialized')]
    raw_data = ''.join(json.dumps(brand) for brand in bike_brands)
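    # raw_data is a stream of concatenated JSON objects, e.g. '{"name": "Bianchi"}{"name": "BMC"}...';
    # the JSON data format parses one record per top-level object, so no delimiter is needed.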

    table_name = '/user/sdc/{}'.format(get_random_string(string.ascii_letters, 10))

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       raw_data=raw_data)

    mapr_db = pipeline_builder.add_stage('MapR DB', type='destination')
    mapr_db.set_attributes(table_name=table_name,
                           row_key='/name',
                           fields=[dict(columnValue='/name',
                                        columnStorageType='TEXT',
                                        columnName='cf1:cq1')])

    dev_raw_data_source >> mapr_db
    pipeline = pipeline_builder.build().configure_for_environment(cluster)

    try:
        logger.info('Creating MapR-DB table %s ...', table_name)
        cluster.execute_command('table', 'create', http_request_method='POST',
                                data={'path': table_name,
                                      'defaultreadperm': 'p',
                                      'defaultwriteperm': 'p'})
        cluster.execute_command('table', 'cf', 'create', http_request_method='POST',
                                data={'path': table_name, 'cfname': 'cf1'})

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(len(bike_brands))
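        # Dev Raw Data Source re-sends raw_data batch after batch until the pipeline is stopped,
        # so waiting for len(bike_brands) batches simply gives the destination time to write
        # every row at least once.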

        table = cluster.mapr_db.client.table(name=table_name)
        # Due to the following bug in MapR 6.0.1 MEP 5.0, the MapR-DB table.scan() call hangs and times out:
        # https://mapr.com/support/s/article/Hung-issue-when-using-HappyBase-python-to-SCAN-MapRDB?language=ja%29
        # Hence, read the table with individual table.row() calls instead of a whole-table scan.
        result = [(bike_brand['name'].encode(), table.row(bike_brand['name'].encode()))
                  for bike_brand in bike_brands]
        # Bike brands are stored in a list of dicts ('name' => brand). Build the (row key, row data)
        # tuples we expect back from MapR-DB; because rows are fetched individually in bike_brands
        # order, no sorting is needed here.
        assert [(bike_brand['name'].encode(), {b'cf1:cq1': bike_brand['name'].encode()})
                for bike_brand in bike_brands] == result

    finally:
        logger.info('Deleting MapR-DB table %s ...', table_name)
        cluster.execute_command('table', 'delete', http_request_method='POST', data={'path': table_name})
        sdc_executor.stop_pipeline(pipeline)
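
For reference, a minimal standalone sketch of the verification step above, assuming
cluster.mapr_db.client wraps a happybase-style Connection (the host and table path below are
illustrative placeholders, not values from the test environment):

import happybase

connection = happybase.Connection('maprdemo')  # hypothetical MapR gateway host
table = connection.table('/user/sdc/bikes')    # hypothetical table path
for key in (b'BMC', b'Bianchi'):
    print(table.row(key))  # row() returns a dict like {b'cf1:cq1': b'BMC'}
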
def test_mapr_json_db_cdc_origin(sdc_builder, sdc_executor, cluster):
    """Insert, update, delete a handful of records in the MapR-DB json table using a pipeline.
    After that create another pipeline with CDC Consumer and verify with snapshot that MapR DB CDC
    consumer gets the correct data.

    dev_raw_data_source >> expression evaluator >> field_remover >> mapr_db_json
    mapr_db_cdc_consumer >> trash
    """
    if not cluster.version[len('mapr'):].startswith('6'):
        pytest.skip('MapR CDC test only runs against cluster with MapR version 6.')
    if cluster.mep_version == "4.0":
        pytest.skip('MapR CDC tests are written only for MEP 5 and above.')

    table_name = get_random_string(string.ascii_letters, 10)
    topic_name = f'{table_name}-topic'
    table_path = f'/user/sdc/{table_name}'
    stream_name = f'/{get_random_string(string.ascii_letters, 10)}'

    # Generate some data.
    test_data = [
        dict(_id='1',
             name='Sachin Tendulkar',
             operation='insert',
             average=53.79,
             is_alive=True,
             runs_bf=1592129437,
             innings=329),
        dict(_id='2',
             name='Don Bradman',
             operation='insert',
             average=53.79,
             is_alive=False,
             runs_bf=69969798,
             innings=80),
        dict(_id='3',
             name='Gary Sobers',
             operation='insert',
             average=57.78,
             is_alive=True,
             runs_bf=80323867,
             innings=160),
        dict(_id='1', name='Sachin', operation='update'),
        dict(_id='2', name='Don', operation='update'),
        dict(_id='3', operation='delete')
    ]
    raw_data = ''.join(json.dumps(record) for record in test_data)

    # Expected final table contents: the Field Remover drops the 'operation' field, record '3'
    # has been deleted, and records '1' and '2' carry their updated names.
    final_data = [
        dict(_id='1',
             name='Sachin',
             average=53.79,
             is_alive=True,
             runs_bf=1592129437,
             innings=329),
        dict(_id='2',
             name='Don',
             average=53.79,
             is_alive=False,
             runs_bf=69969798,
             innings=80)
    ]

    # Build the MapR JSON DB pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='JSON',
                                              stop_after_first_batch=True,
                                              raw_data=raw_data)
    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    header_attribute_expressions = (
        "${record:value('/operation')=='insert'?1:"
        "record:value('/operation')=='update'?3:"
        "record:value('/operation')=='delete'?2:1}")
    expression_evaluator.set_attributes(header_attribute_expressions=[{
        'attributeToSet':
        'sdc.operation.type',
        'headerAttributeExpression':
        header_attribute_expressions
    }])
    field_remover = pipeline_builder.add_stage('Field Remover')
    field_remover.set_attributes(fields=['/operation'])
    mapr_db_json_destination = pipeline_builder.add_stage('MapR DB JSON',
                                                          type='destination')
    mapr_db_json_destination.set_attributes(table_name=table_path,
                                            row_key='/_id')

    dev_raw_data_source >> expression_evaluator >> field_remover >> mapr_db_json_destination
    json_db_destination_pipeline = pipeline_builder.build(
        'MapR Json DB Destination').configure_for_environment(cluster)

    # Build the MapR DB CDC Consumer pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    mapr_db_cdc_consumer = pipeline_builder.add_stage('MapR DB CDC Consumer',
                                                      type='origin')
    mapr_db_cdc_consumer.set_attributes(
        mapr_streams_configuration=[
            dict(key='auto.offset.reset', value='earliest')
        ],
        number_of_threads=1,
        topic_list=[
            dict(key=f'{stream_name}:{topic_name}', value=f'{table_path}')
        ])
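    # The topic list maps each changelog topic ('<stream path>:<topic>') to the MapR-DB
    # table whose mutations it carries.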

    trash = pipeline_builder.add_stage('Trash')
    mapr_db_cdc_consumer >> trash
    cdc_pipeline = pipeline_builder.build(
        'MapR DB CDC Consumer').configure_for_environment(cluster)

    # Build the MapR DB JSON Consumer pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    mapr_db_json_origin = pipeline_builder.add_stage('MapR DB JSON Origin')
    mapr_db_json_origin.set_attributes(table_name=table_path)
    trash = pipeline_builder.add_stage('Trash')
    mapr_db_json_origin >> trash
    json_db_origin_pipeline = pipeline_builder.build(
        'MapR Json DB Origin').configure_for_environment(cluster)

    try:
        logger.info('Creating MapR-DB table %s ...', table_path)
        cluster.execute_command('table',
                                'create',
                                http_request_method='POST',
                                data={
                                    'path': table_path,
                                    'defaultreadperm': 'p',
                                    'tabletype': 'json',
                                    'defaultwriteperm': 'p'
                                })

        logger.info('Creating MapR stream %s ...', stream_name)
        cluster.execute_command('stream',
                                'create',
                                http_request_method='POST',
                                data={
                                    'path': stream_name,
                                    'ischangelog': 'true',
                                    'consumeperm': 'p',
                                    'defaultpartitions': 1
                                })
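        # 'ischangelog' marks the stream as a changelog stream, which is what lets it back
        # a MapR-DB table's CDC feed in the next step.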

        changelog = f'{stream_name}:{topic_name}'
        logger.info('Creating MapR-DB table changelog %s ...', changelog)
        cluster.execute_command('table',
                                'changelog',
                                'add',
                                http_request_method='POST',
                                data={
                                    'path': table_path,
                                    'changelog': changelog
                                })

        sdc_executor.add_pipeline(json_db_destination_pipeline, cdc_pipeline,
                                  json_db_origin_pipeline)
        sdc_executor.start_pipeline(json_db_destination_pipeline)

        cdc_pipeline_command = sdc_executor.capture_snapshot(
            cdc_pipeline, start_pipeline=True, wait=False)
        json_origin_pipeline_command = sdc_executor.capture_snapshot(
            json_db_origin_pipeline, start_pipeline=True, wait=False)

        # Verify with a snapshot.
        cdc_snapshot = cdc_pipeline_command.wait_for_finished(
            timeout_sec=120).snapshot
        json_snapshot = json_origin_pipeline_command.wait_for_finished(
            timeout_sec=120).snapshot
        sdc_executor.stop_pipeline(cdc_pipeline)
        sdc_executor.stop_pipeline(json_db_origin_pipeline)

        actual_cdc = [
            record.field
            for record in cdc_snapshot[mapr_db_cdc_consumer].output
        ]
        for record in test_data:
            # The Field Remover stage dropped the 'operation' field in the pipeline, so it will not
            # be present in the actual records; remove it from test_data before asserting.
            record.pop('operation')

        actual_json = [
            record.field
            for record in json_snapshot[mapr_db_json_origin].output
        ]

        assert actual_cdc == test_data
        assert actual_json == final_data
    finally:
        logger.info('Deleting MapR-DB table changelog %s ...',
                    f'{stream_name}:{topic_name}')
        cluster.execute_command('table',
                                'changelog',
                                'remove',
                                http_request_method='POST',
                                data={
                                    'path': table_path,
                                    'changelog': f'{stream_name}:{topic_name}'
                                })
        logger.info('Deleting MapR stream %s ...', stream_name)
        cluster.execute_command('stream',
                                'delete',
                                http_request_method='POST',
                                data={'path': stream_name})
        logger.info('Deleting MapR-DB table %s ...', table_path)
        cluster.execute_command('table',
                                'delete',
                                http_request_method='POST',
                                data={'path': table_path})
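
A minimal sketch of the expected-data derivation used in the test above: the Field Remover drops
'/operation' before the write, so each CDC payload should match its input record minus that field
(the helper name here is illustrative, not part of the test):

def expected_cdc_payloads(records):
    return [{k: v for k, v in record.items() if k != 'operation'} for record in records]

assert expected_cdc_payloads([{'_id': '1', 'operation': 'update', 'name': 'Sachin'}]) == \
    [{'_id': '1', 'name': 'Sachin'}]
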
Example #3
def test_mapr_db_cdc_origin_preview(sdc_builder, sdc_executor, cluster, input_records):
    """We had an issue in which preview pipeline committed records read from streams, which made actual runs not
    read those records. This test will preview the pipeline and then assert we have the expected number of records.

    dev_data_generator >> expression evaluator >> field_remover >> mapr_db_json
    mapr_db_cdc_consumer >> wiretap
    """
    if not cluster.version[len('mapr'):].startswith('6'):
        pytest.skip('MapR CDC test only runs against cluster with MapR version 6.')
    if cluster.mep_version == "4.0":
        pytest.skip('MapR CDC tests are written only for MEP 5 and above.')

    table_name = get_random_string(string.ascii_letters, 10)
    topic_name = f'{table_name}-topic'
    table_path = f'/user/sdc/{table_name}'
    stream_name = f'/{get_random_string(string.ascii_letters, 10)}'

    # Build the MapR JSON DB pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')
    dev_data_generator.set_attributes(records_to_be_generated=input_records)
    dev_data_generator.fields_to_generate = [
        {'field': '_id', 'type': 'STRING'},
        {'field': 'name', 'type': 'STRING'},
        {'field': 'address', 'type': 'STRING'},
        {'field': 'mail', 'type': 'STRING'},
    ]

    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    header_attribute_expressions = ("${record:value('/operation')=='insert'?1:"
                                    "record:value('/operation')=='update'?3:"
                                    "record:value('/operation')=='delete'?2:1}")
    expression_evaluator.set_attributes(header_attribute_expressions=[
        {'attributeToSet': 'sdc.operation.type',
         'headerAttributeExpression': header_attribute_expressions}
    ])
    field_remover = pipeline_builder.add_stage('Field Remover')
    field_remover.set_attributes(fields=['/operation'])
    mapr_db_json_destination = pipeline_builder.add_stage('MapR DB JSON', type='destination')
    mapr_db_json_destination.set_attributes(table_name=table_path, row_key='/_id')

    dev_data_generator >> expression_evaluator >> field_remover >> mapr_db_json_destination
    json_db_destination_pipeline = pipeline_builder.build('MapR Json DB Destination').configure_for_environment(cluster)

    # Build the MapR DB CDC Consumer pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    mapr_db_cdc_consumer = pipeline_builder.add_stage('MapR DB CDC Consumer', type='origin')
    mapr_db_cdc_consumer.set_attributes(mapr_streams_configuration=[dict(key='auto.offset.reset',
                                                                         value='earliest')],
                                        number_of_threads=1,
                                        topic_list=[dict(key=f'{stream_name}:{topic_name}',
                                                         value=f'{table_path}')])

    wiretap_cdc = pipeline_builder.add_wiretap()
    mapr_db_cdc_consumer >> wiretap_cdc.destination
    cdc_pipeline = pipeline_builder.build('MapR DB CDC Consumer').configure_for_environment(cluster)

    try:
        logger.info('Creating MapR-DB table %s ...', table_path)
        cluster.execute_command('table', 'create', http_request_method='POST',
                                data={'path': table_path,
                                      'defaultreadperm': 'p',
                                      'tabletype': 'json',
                                      'defaultwriteperm': 'p'})

        logger.info('Creating MapR stream %s ...', stream_name)
        cluster.execute_command('stream', 'create', http_request_method='POST',
                                data={'path': stream_name,
                                      'ischangelog': 'true',
                                      'consumeperm': 'p',
                                      'defaultpartitions': 1})

        changelog = f'{stream_name}:{topic_name}'
        logger.info('Creating MapR-DB table changelog %s ...', changelog)
        cluster.execute_command('table', 'changelog', 'add', http_request_method='POST',
                                data={'path': table_path,
                                      'changelog': changelog})

        sdc_executor.add_pipeline(json_db_destination_pipeline, cdc_pipeline)
        sdc_executor.start_pipeline(json_db_destination_pipeline).wait_for_finished()

        preview = sdc_executor.run_pipeline_preview(cdc_pipeline, timeout=30000).preview
        assert preview is not None
        assert preview.issues.issues_count == 0

        # First, assert that the preview returns the default batch of 10 records.
        assert len(preview[mapr_db_cdc_consumer].output) == 10
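        # If preview had committed its consumer offsets (the original bug), the run below would
        # see only input_records - 10 records and the final assert would fail.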

        sdc_executor.start_pipeline(cdc_pipeline)
        sdc_executor.wait_for_pipeline_metric(cdc_pipeline, 'input_record_count', input_records, timeout_sec=90)

        actual_cdc = [record.field for record in wiretap_cdc.output_records]

        # Then assert that the run consumed all input_records records, not 10 fewer.
        assert len(actual_cdc) == input_records
    finally:
        if sdc_executor.get_pipeline_status(cdc_pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(cdc_pipeline)
        logger.info('Deleting MapR-DB table changelog %s ...', f'{stream_name}:{topic_name}')
        cluster.execute_command('table', 'changelog', 'remove', http_request_method='POST',
                                data={'path': table_path, 'changelog': f'{stream_name}:{topic_name}'})
        logger.info('Deleting MapR stream %s ...', stream_name)
        cluster.execute_command('stream', 'delete', http_request_method='POST', data={'path': stream_name})
        logger.info('Deleting MapR-DB table %s ...', table_path)
        cluster.execute_command('table', 'delete', http_request_method='POST', data={'path': table_path})
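
A reusable sketch of the stop-if-running guard used in the finally block above, assuming the SDC
pipeline-status payload carries a top-level 'status' key (the 'STARTING' state is an added
assumption; the test itself only checks for 'RUNNING'):

def stop_if_running(executor, pipeline):
    status = executor.get_pipeline_status(pipeline).response.json().get('status')
    if status in ('RUNNING', 'STARTING'):
        executor.stop_pipeline(pipeline)
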
Example #4
def test_mapr_db_destination(sdc_builder, sdc_executor, cluster):
    """Write a handful of records to the MapR-DB destination and confirm their presence with an HBase client.

    dev_raw_data_source >> mapr_db
    """
    # Generate some data.
    bike_brands = [
        dict(name='Cannondale'),
        dict(name='Specialized'),
        dict(name='Bianchi'),
        dict(name='BMC')
    ]
    raw_data = ''.join(json.dumps(brand) for brand in bike_brands)

    table_name = '/user/sdc/{}'.format(
        get_random_string(string.ascii_letters, 10))

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON', raw_data=raw_data)

    mapr_db = pipeline_builder.add_stage('MapR DB', type='destination')
    mapr_db.set_attributes(table_name=table_name,
                           row_key='/name',
                           fields=[
                               dict(columnValue='/name',
                                    columnStorageType='TEXT',
                                    columnName='cf1:cq1')
                           ])

    dev_raw_data_source >> mapr_db
    pipeline = pipeline_builder.build().configure_for_environment(cluster)

    try:
        logger.info('Creating MapR-DB table %s ...', table_name)
        cluster.execute_command('table',
                                'create',
                                http_request_method='POST',
                                data={
                                    'path': table_name,
                                    'defaultreadperm': 'p',
                                    'defaultwriteperm': 'p'
                                })
        cluster.execute_command('table',
                                'cf',
                                'create',
                                http_request_method='POST',
                                data={
                                    'path': table_name,
                                    'cfname': 'cf1'
                                })

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(
            len(bike_brands))

        rows = list(cluster.mapr_db.client.table(name=table_name).scan())
        # Bike brands are stored in a list of dicts ('name' => brand). Manipulate this to match what we
        # expect our MapR-DB rows to look like (including putting them in lexicographic order).
        assert sorted((bike_brand['name'].encode(), {
            b'cf1:cq1': bike_brand['name'].encode()
        }) for bike_brand in bike_brands) == rows
    finally:
        logger.info('Deleting MapR-DB table %s ...', table_name)
        cluster.execute_command('table',
                                'delete',
                                http_request_method='POST',
                                data={'path': table_name})
        sdc_executor.stop_pipeline(pipeline)
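
This variant verifies with a whole-table scan, so the expected list must be sorted: HBase-style
scans return rows in lexicographic byte order of the row key. A quick self-contained illustration
of that ordering:

expected = sorted([(b'Cannondale', {}), (b'Specialized', {}), (b'Bianchi', {}), (b'BMC', {})])
assert [key for key, _ in expected] == [b'BMC', b'Bianchi', b'Cannondale', b'Specialized']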