def test_elasticsearch_pipeline_errors(sdc_builder, sdc_executor,
                                       elasticsearch):
    """Verify that a pipeline's error records end up in Elasticsearch.

    A Dev Raw Data Source is wired straight into a To Error stage, so its record is handed to the pipeline's
    error record handling, which is configured here as 'Write to Elasticsearch'. The document found in
    Elasticsearch is then compared with what was sent.

    Elasticsearch error pipeline:
        dev_raw_data_source >> error_target
    """
    # Fixed inputs for this run
    index_name = get_random_string(string.ascii_letters, 10).lower()  # Elasticsearch indexes must be lower case
    mapping_name = get_random_string(string.ascii_letters, 10)
    document_id = get_random_string(string.ascii_letters, 10)
    payload = 'Hello World!'

    # Assemble the pipeline: error handling first, then origin and destination
    pipeline_builder = sdc_builder.get_pipeline_builder()
    pipeline_builder.add_error_stage('Write to Elasticsearch').set_attributes(document_id=document_id,
                                                                              index=index_name,
                                                                              mapping=mapping_name)
    origin = pipeline_builder.add_stage('Dev Raw Data Source').set_attributes(data_format='TEXT',
                                                                              stop_after_first_batch=True,
                                                                              raw_data=payload)
    to_error = pipeline_builder.add_stage('To Error')
    origin >> to_error

    pipeline = pipeline_builder.build(title='ES error pipeline')
    pipeline = pipeline.configure_for_environment(elasticsearch)
    sdc_executor.add_pipeline(pipeline)

    try:
        elasticsearch.connect()

        # The index has to be in place before the pipeline writes to it
        target_index = Index(index_name)
        target_index.create()
        assert target_index.refresh()

        # Run the pipeline to completion, then look the document up
        pipeline_run = sdc_executor.start_pipeline(pipeline)
        pipeline_run.wait_for_finished()

        # Index, mapping and document id are all fixed, so the first hit is the document under test
        hit = _es_search_with_retry(ESSearch(index=index_name))[0]
        meta = hit.meta

        # Metadata of the stored document
        assert meta['index'] == index_name
        assert meta['doc_type'] == mapping_name
        assert meta['id'] == document_id
        # Payload of the stored document
        assert payload == hit.text
    finally:
        # Remove the test index from Elasticsearch
        Index(index_name).delete()
def test_elasticsearch_target(sdc_builder, sdc_executor, elasticsearch,
                              additional_properties):
    """Test for the Elasticsearch target stage.

    Data is ingested via a Dev Raw Data Source into an Elasticsearch destination, and the document read back
    from Elasticsearch is compared with what was sent. ``additional_properties`` is passed unchanged to the
    destination's ``additional_properties`` attribute.

    Elasticsearch target pipeline:
        dev_raw_data_source >> es_target
    """
    # Test static
    es_index = get_random_string(string.ascii_letters, 10).lower()  # Elasticsearch indexes must be lower case
    es_mapping = get_random_string(string.ascii_letters, 10)
    es_doc_id = get_random_string(string.ascii_letters, 10)
    raw_str = 'Hello World!'

    # Build pipeline
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='TEXT',
                                                                                  raw_data=raw_str)
    es_target = builder.add_stage('Elasticsearch', type='destination')
    es_target.set_attributes(default_operation='INDEX',
                             document_id=es_doc_id,
                             index=es_index,
                             mapping=es_mapping,
                             additional_properties=additional_properties)

    dev_raw_data_source >> es_target
    es_target_pipeline = builder.build(title='ES target pipeline').configure_for_environment(elasticsearch)
    es_target_pipeline.configuration["shouldRetry"] = False

    sdc_executor.add_pipeline(es_target_pipeline)

    try:
        # Connect up front so that every index call below, including the cleanup, runs against an
        # established connection even when the pipeline run itself fails.
        elasticsearch.connect()

        # Run pipeline and read from Elasticsearch to assert
        sdc_executor.start_pipeline(es_target_pipeline).wait_for_pipeline_batch_count(1)
        sdc_executor.stop_pipeline(es_target_pipeline)

        # Elasticsearch search is near-real-time: force a refresh so the freshly written document is
        # visible to the search below instead of depending on the periodic refresh having happened.
        Index(es_index).refresh()

        # Since we are upsert on the same index, map, doc - there should only be one document (index 0)
        es_search = ESSearch(index=es_index)
        es_response = es_search.execute()
        es_hit = es_response[0]
        es_meta = es_hit.meta
        # assert meta ingest
        assert es_meta['index'] == es_index
        assert es_meta['doc_type'] == es_mapping
        assert es_meta['id'] == es_doc_id
        # assert data ingest
        assert raw_str == es_hit.text
    finally:
        # Clean up test data in ES. The index only exists if the pipeline managed to write to it, so
        # check first rather than raising a second error on top of the real failure.
        idx = Index(es_index)
        if idx.exists():
            idx.delete()
# Example #3
# 0
def test_elasticsearch_target_additional_properties(sdc_builder, sdc_executor, elasticsearch):
    """Test the Elasticsearch target's additional properties with a per-record routing value.

    Four JSON records are written, each to its own index. The destination's additional properties set
    ``_routing`` from the record's ``/shard`` field. For the three records that carry a shard value, the
    routing reported in the document metadata must equal that value; for the record whose shard is null,
    the metadata must not contain a routing entry.

    Elasticsearch target pipeline:
        dev_raw_data_source >> es_target
    """
    # Test static
    index_values = [get_random_string(string.ascii_letters, 10).lower() for _ in range(4)]
    shard_values = ["record1", "record2", "record3", None]  # the last record deliberately has no shard

    raw_data = [{"text": "Record{}".format(number),
                 "index": index_name,
                 "mapping": get_random_string(string.ascii_letters, 10).lower(),
                 "doc_id": get_random_string(string.ascii_letters, 10).lower(),
                 "shard": shard}
                for number, (index_name, shard) in enumerate(zip(index_values, shard_values), start=1)]

    # Build pipeline
    builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = builder.add_stage('Dev Raw Data Source').set_attributes(data_format='JSON',
                                                                                  stop_after_first_batch=True,
                                                                                  raw_data='\n'.join(json.dumps(rec)
                                                                                                     for rec in raw_data))
    es_target = builder.add_stage('Elasticsearch', type='destination')
    es_target.set_attributes(default_operation='INDEX', document_id='${record:value(\'/doc_id\')}',
                             index='${record:value(\'/index\')}', mapping='${record:value(\'/mapping\')}',
                             additional_properties='{\"_routing\":${record:value(\'/shard\')}}')

    dev_raw_data_source >> es_target
    es_target_pipeline = builder.build(title='ES target pipeline').configure_for_environment(elasticsearch)

    sdc_executor.add_pipeline(es_target_pipeline)
    try:
        elasticsearch.connect()

        # Make sure that the index exists properly before running the test
        index = Index(index_values[0])
        index.create()
        assert index.refresh()

        # Run pipeline with additional properties
        sdc_executor.start_pipeline(es_target_pipeline).wait_for_finished()

        es_response = []
        for index_name in index_values:
            # Refresh before searching so the written document is visible right away; this replaces a
            # fixed sleep that ran only after each search had already been executed.
            Index(index_name).refresh()
            es_response.append(ESSearch(index=index_name).execute()[0])

        assert len(es_response) == 4
        for hit in es_response:
            assert hit
            if hit.text == "Record4":
                # No shard value was supplied, so no routing may show up in the metadata
                assert all(attribute != "routing" for attribute in hit.meta)
            else:
                assert hit.shard == hit.meta.routing

    finally:
        # Clean up test data in ES: every record went to its own index, so remove all of them,
        # skipping any index that was never created.
        for index_name in index_values:
            idx = Index(index_name)
            if idx.exists():
                idx.delete()