Example #1
def test_trino_ingest(docker_compose_runner, pytestconfig, tmp_path,
                      mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/trino"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "trino") as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get("http://localhost:5300/v1/info").json()[
                "starting"] is False,
        )

        # Set up the hive db
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            print(tmp_path)

            # Run the metadata ingestion pipeline for trino catalog referring to postgres database
            config_file = (test_resources_dir / "trino_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_mces.json",
                golden_path=test_resources_dir / "trino_mces_golden.json",
            )

            # Limitation 1 - The MCE contains "nullable": true for all fields in the trino database, irrespective of NOT NULL constraints in the underlying postgres database.
            # This is an issue with trino itself, reported at https://github.com/trinodb/trino/issues/6400 (related: https://github.com/trinodb/trino/issues/4070).

            # Limitation 2 - Dataset properties for a postgres view (view definition, etc.) are not part of the MCE emitted via trino.
            # Postgres views are exposed as tables in trino; this behavior depends on the trino connector implementation - https://trino.io/episodes/18.html

            # Run the metadata ingestion pipeline for trino catalog referring to hive database
            config_file = (test_resources_dir /
                           "trino_hive_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_hive_mces.json",
                golden_path=test_resources_dir / "trino_hive_mces_golden.json",
                ignore_paths=[
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
                ],
            )
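The listings on this page omit their import blocks. As a hedged sketch, the examples here typically rely on imports like the following (module paths follow the DataHub repository's test layout and may differ from your checkout):

import subprocess
import time

import pytest
import requests
from click.testing import CliRunner

from datahub.entrypoints import datahub
from datahub.ingestion.run.pipeline import Pipeline
from tests.test_helpers import fs_helpers, mce_helpers
from tests.test_helpers.click_helpers import run_datahub_cmd
from tests.test_helpers.docker_helpers import wait_for_port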
Example #2
def test_ldap_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/ldap"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "ldap") as docker_services:
        # The openldap container loads the sample data after exposing the port publicly. As such,
        # we must wait a little bit extra to ensure that the sample data is loaded.
        wait_for_port(docker_services, "openldap", 389)
        time.sleep(5)

        pipeline = Pipeline.create({
            "run_id": "ldap-test",
            "source": {
                "type": "ldap",
                "config": {
                    "ldap_server": "ldap://localhost",
                    "ldap_user": "******",
                    "ldap_password": "******",
                    "base_dn": "dc=example,dc=org",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/ldap_mces.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        output = mce_helpers.load_json_file(str(tmp_path / "ldap_mces.json"))
        golden = mce_helpers.load_json_file(
            str(test_resources_dir / "ldap_mces_golden.json"))
        mce_helpers.assert_mces_equal(output, golden)
Example #3
def test_hive_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/hive"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "hive") as docker_services:
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        # Set up the container.
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "hive_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "hive_mces.json",
            golden_path=test_resources_dir / "hive_mces_golden.json",
            ignore_paths=[
                # example: root[1]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['CreateTime:']
                # example: root[2]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['Table Parameters: transient_lastDdlTime']
                r"root\[\d+\]\['proposedSnapshot'\]\['com\.linkedin\.pegasus2avro\.metadata\.snapshot\.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com\.linkedin\.pegasus2avro\.dataset\.DatasetProperties'\]\['customProperties'\]\['.*Time.*'\]",
                r"root\[6\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.schema.SchemaMetadata'\]\['fields'\]\[\d+\]\['nativeDataType'\]",
            ],
        )
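The ignore_paths entries above are regular expressions matched against DeepDiff-style paths; check_golden_file uses them to skip volatile values such as transient_lastDdlTime. A minimal sketch of the underlying mechanism, assuming the helper delegates to deepdiff's exclude_regex_paths:

from deepdiff import DeepDiff

golden = {"customProperties": {"transient_lastddltime": "111", "numrows": "5"}}
output = {"customProperties": {"transient_lastddltime": "222", "numrows": "5"}}

# The regex excludes the volatile timestamp property from the comparison,
# so only the stable properties are diffed.
diff = DeepDiff(
    golden,
    output,
    exclude_regex_paths=[r"root\['customProperties'\]\['transient_lastddltime'\]"],
)
assert diff == {}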
Example #4
def test_mongodb_ingest(docker_compose_runner, pytestconfig, tmp_path,
                        mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mongodb"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "mongo") as docker_services:
        wait_for_port(docker_services, "testmongodb", 27017)

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create({
            "run_id": "mongodb-test",
            "source": {
                "type": "mongodb",
                "config": {
                    "connect_uri": "mongodb://localhost:57017",
                    "username": "******",
                    "password": "******",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/mongodb_mces.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mongodb_mces.json",
            golden_path=test_resources_dir / "mongodb_mces_golden.json",
        )
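The waited-on port (27017) and the connect_uri port (57017) differ on purpose: wait_for_port takes the container port, while the recipe connects through the host-mapped port. A hedged sketch of the underlying pytest-docker call, assuming the helper resolves ports this way (docker_services comes from the with block above):

# port_for maps a service's container port to the published host port,
# e.g. testmongodb's 27017 may be published as 57017 on the host.
host_port = docker_services.port_for("testmongodb", 27017)
connect_uri = f"mongodb://localhost:{host_port}"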
Example #5
def test_mysql_ingest_with_db_alias(
    docker_compose_runner, pytestconfig, tmp_path, mock_time
):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mysql"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "mysql"
    ) as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mysql_to_file_dbalias.yml").resolve()
        run_datahub_cmd(
            ["ingest", "--strict-warnings", "-c", f"{config_file}"], tmp_path=tmp_path
        )

        # Verify the output.
        # Assert that all events generated have instance specific urns
        import re

        urn_pattern = "^" + re.escape(
            "urn:li:dataset:(urn:li:dataPlatform:mysql,foogalaxy."
        )
        mce_helpers.assert_mcp_entity_urn(
            filter="ALL",
            entity_type="dataset",
            regex_pattern=urn_pattern,
            file=tmp_path / "mysql_mces_dbalias.json",
        )
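Since re.escape neutralizes the "(" and "." metacharacters, urn_pattern is a literal prefix match: every dataset URN must start with the instance-qualified platform prefix. A quick hedged illustration (the table names are made up):

import re

urn_pattern = "^" + re.escape(
    "urn:li:dataset:(urn:li:dataPlatform:mysql,foogalaxy."
)
# Matches only URNs carrying the "foogalaxy" database alias prefix.
assert re.match(
    urn_pattern,
    "urn:li:dataset:(urn:li:dataPlatform:mysql,foogalaxy.db.table,PROD)",
)
assert not re.match(
    urn_pattern,
    "urn:li:dataset:(urn:li:dataPlatform:mysql,db.table,PROD)",
)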
Example #6
def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "kafka"
    ) as docker_services:

        wait_for_port(docker_services, "test_broker", 9092, timeout=120)

        # Set up topics and produce some data
        command = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            config_file = (test_resources_dir / "kafka_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert result.exit_code == 0

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_mces.json",
            golden_path=test_resources_dir / "kafka_mces_golden.json",
            ignore_paths=[],
        )
Example #7
def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "kafka"
    ) as docker_services:

        wait_for_port(docker_services, "test_broker", 59092, timeout=120)
        wait_for_port(docker_services, "test_schema_registry", 8081, timeout=120)

        # Set up topics and produce some data
        command = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "kafka_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_mces.json",
            golden_path=test_resources_dir / "kafka_mces_golden.json",
            ignore_paths=[],
        )
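The kafka_to_file.yml recipe itself is not shown on this page. As a hedged sketch only, an equivalent programmatic pipeline, mirroring the Pipeline.create pattern used in the other examples, might look like this (the bootstrap and schema registry addresses are assumptions, not the actual recipe contents):

pipeline = Pipeline.create({
    "run_id": "kafka-test",
    "source": {
        "type": "kafka",
        "config": {
            # Assumed connection values; the real recipe may differ.
            "connection": {
                "bootstrap": "localhost:59092",
                "schema_registry_url": "http://localhost:8081",
            },
        },
    },
    "sink": {
        "type": "file",
        "config": {
            "filename": "kafka_mces.json",
        },
    },
})
pipeline.run()
pipeline.raise_from_status()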
Example #8
def test_feast_ingest(docker_compose_runner, pytestconfig, tmp_path):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/feast"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "feast") as docker_services:
        wait_for_port(docker_services, "testfeast", 6565)

        # container listens to this port once test cases have been setup
        wait_for_port(docker_services, "testfeast_setup", 6789)

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create({
            "run_id": "feast-test",
            "source": {
                "type": "feast",
                "config": {
                    "core_url": "localhost:6565",
                    "use_local_build": True,
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/feast_mces.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        # Verify the output.
        output = mce_helpers.load_json_file(str(tmp_path / "feast_mces.json"))
        golden = mce_helpers.load_json_file(
            str(test_resources_dir / "feast_mce_golden.json"))
        mce_helpers.assert_mces_equal(output, golden)
Example #9
def test_mssql_ingest(docker_compose_runner, pytestconfig, tmp_path,
                      mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/sql_server"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "sql-server") as docker_services:
        # Wait for SQL Server to be ready. We wait a few extra seconds, as the port becoming
        # available does not mean the server is accepting connections.
        # TODO: find a better way to check for liveness.
        wait_for_port(docker_services, "testsqlserver", 1433)
        time.sleep(5)

        # Run the setup.sql file to populate the database.
        docker = "docker"
        command = f"{docker} exec testsqlserver /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P 'test!Password' -d master -i /setup/setup.sql"
        ret = subprocess.run(command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        assert ret.returncode == 0

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mssql_to_file.yml").resolve()
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert result.exit_code == 0

            output = mce_helpers.load_json_file("mssql_mces.json")

        # Verify the output.
        golden = mce_helpers.load_json_file(
            str(test_resources_dir / "mssql_mce_golden.json"))
        mce_helpers.assert_mces_equal(output, golden)
Example #10
def test_ge_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time,
                   **kwargs):

    test_resources_dir = pytestconfig.rootpath / "tests/integration/great-expectations"

    with docker_compose_runner(
            test_resources_dir / "docker-compose.yml",
            "great-expectations") as docker_services, mock.patch(
                "datahub.emitter.rest_emitter.DatahubRestEmitter.emit_mcp"
            ) as mock_emit_mcp:
        wait_for_port(docker_services, "ge_postgres", 5432)

        emitter = MockDatahubEmitter("")
        mock_emit_mcp.side_effect = emitter.emit_mcp

        shutil.copytree(
            test_resources_dir / "setup/great_expectations",
            tmp_path / "great_expectations",
        )
        context = ge.DataContext.create(tmp_path)
        context.run_checkpoint(checkpoint_name="test_checkpoint")

        emitter.write_to_file(tmp_path / "ge_mcps.json")

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "ge_mcps.json",
            golden_path=test_resources_dir / "ge_mcps_golden.json",
            ignore_paths=[],
        )
Example #11
@pytest.fixture
def trino_runner(docker_compose_runner, pytestconfig):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/trino"
    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "trino") as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get("http://localhost:5300/v1/info").json()[
                "starting"] is False,
        )

        yield docker_services
Example #12
def test_mysql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mysql"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "mysql"
    ) as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mysql_to_file.yml").resolve()
        run_datahub_cmd(
            ["ingest", "--strict-warnings", "-c", f"{config_file}"], tmp_path=tmp_path
        )

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mysql_mces.json",
            golden_path=test_resources_dir / "mysql_mces_golden.json",
        )
Example #13
def test_mysql_ingest(docker_compose_runner, pytestconfig, tmp_path,
                      mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mysql"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "mysql") as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            config_file = (test_resources_dir / "mysql_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert result.exit_code == 0

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="mysql_mces.json",
                golden_path=test_resources_dir / "mysql_mces_golden.json",
            )
Example #14
@pytest.fixture
def hive_runner(docker_compose_runner, pytestconfig):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/hive"
    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "hive") as docker_services:
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)
        yield docker_services
Example #15
def test_kafka_connect_ingest(docker_compose_runner, pytestconfig, tmp_path,
                              mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka-connect"
    test_resources_dir_kafka = pytestconfig.rootpath / "tests/integration/kafka"

    # Share Compose configurations between files and projects
    # https://docs.docker.com/compose/extends/
    docker_compose_file = [
        str(test_resources_dir_kafka / "docker-compose.yml"),
        str(test_resources_dir / "docker-compose.override.yml"),
    ]
    with docker_compose_runner(docker_compose_file,
                               "kafka-connect") as docker_services:
        wait_for_port(docker_services, "test_broker", 59092, timeout=120)
        wait_for_port(docker_services, "test_connect", 58083, timeout=120)
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get("http://localhost:58083/connectors", ).
            status_code == 200,
        )
        # Creating MySQL source with no transformations, only topic prefix
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                        "name": "mysql_source1",
                        "config": {
                            "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                            "mode": "incrementing",
                            "incrementing.column.name": "id",
                            "topic.prefix": "test-mysql-jdbc-",
                            "tasks.max": "1",
                            "connection.url": "${env:MYSQL_CONNECTION_URL}"
                        }
                    }
                    """,
        )
        assert r.status_code == 201  # Created
        # Creating MySQL source with regex router transformations, no topic prefix
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                        "name": "mysql_source2",
                        "config": {
                            "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                            "mode": "incrementing",
                            "incrementing.column.name": "id",
                            "tasks.max": "1",
                            "connection.url": "${env:MYSQL_CONNECTION_URL}",
                            "transforms": "TotalReplacement",
                            "transforms.TotalReplacement.type": "org.apache.kafka.connect.transforms.RegexRouter",
                            "transforms.TotalReplacement.regex": ".*(book)",
                            "transforms.TotalReplacement.replacement": "my-new-topic-$1"
                        }
                    }
                    """,
        )
        assert r.status_code == 201  # Created
        # Creating MySQL source with regex router transformations, no topic prefix, table whitelist
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                        "name": "mysql_source3",
                        "config": {
                            "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                            "mode": "incrementing",
                            "incrementing.column.name": "id",
                            "table.whitelist": "book",
                            "tasks.max": "1",
                            "connection.url": "${env:MYSQL_CONNECTION_URL}",
                            "transforms": "TotalReplacement",
                            "transforms.TotalReplacement.type": "org.apache.kafka.connect.transforms.RegexRouter",
                            "transforms.TotalReplacement.regex": ".*",
                            "transforms.TotalReplacement.replacement": "my-new-topic"
                        }
                    }
                    """,
        )
        assert r.status_code == 201  # Created
        # Creating MySQL source with query, topic prefix
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                        "name": "mysql_source4",
                        "config": {
                            "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                            "mode": "incrementing",
                            "incrementing.column.name": "id",
                            "query": "select * from member",
                            "topic.prefix": "query-topic",
                            "tasks.max": "1",
                            "connection.url": "${env:MYSQL_CONNECTION_URL}"
                        }
                    }
                    """,
        )
        assert r.status_code == 201  # Created
        # Creating MySQL source with ExtractTopic router transformations - source dataset not added
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                    "name": "mysql_source5",
                    "config": {
                        "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                        "mode": "incrementing",
                        "incrementing.column.name": "id",
                        "table.whitelist": "book",
                        "topic.prefix": "test-mysql-jdbc2-",
                        "tasks.max": "1",
                        "connection.url": "${env:MYSQL_CONNECTION_URL}",
                        "transforms": "changetopic",
                        "transforms.changetopic.type": "io.confluent.connect.transforms.ExtractTopic$Value",
                        "transforms.changetopic.field": "name"
                    }
                }
                """,
        )
        assert r.status_code == 201  # Created
        # Creating MySQL sink connector - not added
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                        "name": "mysql_sink",
                        "config": {
                            "connector.class": "io.confluent.connect.jdbc.JdbcSinkConnector",
                            "insert.mode": "insert",
                            "auto.create": true,
                            "topics": "my-topic",
                            "tasks.max": "1",
                            "connection.url": "${env:MYSQL_CONNECTION_URL}"
                        }
                    }
                    """,
        )
        assert r.status_code == 201  # Created

        # Creating Debezium MySQL source connector
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                        "name": "debezium-mysql-connector",
                        "config": {
                            "name": "debezium-mysql-connector",
                            "connector.class": "io.debezium.connector.mysql.MySqlConnector",
                            "database.hostname": "test_mysql",
                            "database.port": "3306",
                            "database.user": "******",
                            "database.password": "******",
                            "database.server.name": "debezium.topics",
                            "database.history.kafka.bootstrap.servers": "test_broker:9092",
                            "database.history.kafka.topic": "dbhistory.debeziummysql",
                            "include.schema.changes": "false"
                        }
                    }
                    """,
        )
        assert r.status_code == 201  # Created

        # Give time for connectors to process the table data
        time.sleep(45)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir /
                       "kafka_connect_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_connect_mces.json",
            golden_path=test_resources_dir / "kafka_connect_mces_golden.json",
            ignore_paths=[],
        )
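The fixed time.sleep(45) above gives the connectors time to process the table data. As an alternative, here is a hedged helper sketch that polls the standard Kafka Connect REST status endpoint (GET /connectors/<name>/status) until a connector reports RUNNING; the helper name and timings are illustrative, not part of the original test:

import time

import requests


def wait_for_connector_running(name, base_url="http://localhost:58083", timeout=60):
    # Poll the connector status endpoint until the connector-level state
    # is RUNNING, or give up after the timeout.
    deadline = time.time() + timeout
    while time.time() < deadline:
        status = requests.get(f"{base_url}/connectors/{name}/status")
        if status.ok and status.json()["connector"]["state"] == "RUNNING":
            return
        time.sleep(2)
    raise TimeoutError(f"connector {name} not RUNNING after {timeout}s")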
Example #16
def test_kafka_ingest_with_stateful(
    docker_compose_runner, pytestconfig, tmp_path, mock_time, mock_datahub_graph
):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"
    topic_prefix: str = "stateful_ingestion_test"
    topic_names: List[str] = [f"{topic_prefix}_t1", f"{topic_prefix}_t2"]
    platform_instance = "test_platform_instance_1"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "kafka"
    ) as docker_services:

        wait_for_port(docker_services, "test_broker", KAFKA_PORT, timeout=120)
        wait_for_port(docker_services, "test_schema_registry", 8081, timeout=120)

        source_config_dict: Dict[str, Any] = {
            "connection": {
                "bootstrap": KAFKA_BOOTSTRAP_SERVER,
            },
            "platform_instance": f"{platform_instance}",
            # enable stateful ingestion
            "stateful_ingestion": {
                "enabled": True,
                "remove_stale_metadata": True,
                "state_provider": {
                    "type": "datahub",
                    "config": {"datahub_api": {"server": GMS_SERVER}},
                },
            },
        }

        pipeline_config_dict: Dict[str, Any] = {
            "source": {
                "type": "kafka",
                "config": source_config_dict,
            },
            "sink": {
                # we are not really interested in the resulting events for this test
                "type": "console"
            },
            "pipeline_name": "test_pipeline",
            # enable reporting
            "reporting": [
                {
                    "type": "datahub",
                    "config": {"datahub_api": {"server": GMS_SERVER}},
                }
            ],
        }

        # topics will be automatically created and deleted upon test completion
        with KafkaTopicsCxtManager(
            topic_names, KAFKA_BOOTSTRAP_SERVER
        ) as kafka_ctx, patch(
            "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph",
            mock_datahub_graph,
        ) as mock_checkpoint, patch(
            "datahub.ingestion.reporting.datahub_ingestion_reporting_provider.DataHubGraph",
            mock_datahub_graph,
        ) as mock_reporting:

            # both checkpoint and reporting will use the same mocked graph instance
            mock_checkpoint.return_value = mock_datahub_graph
            mock_reporting.return_value = mock_datahub_graph

            # 1. Do the first run of the pipeline and get the default job's checkpoint.
            pipeline_run1 = run_and_get_pipeline(pipeline_config_dict)
            checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1)

            assert checkpoint1
            assert checkpoint1.state

            # 2. Drop the first topic created during step 1 + rerun the pipeline and get the checkpoint state.
            kafka_ctx.delete_kafka_topics([kafka_ctx.topics[0]])
            # sleep to guarantee eventual consistency for kafka topic deletion
            time.sleep(1)
            pipeline_run2 = run_and_get_pipeline(pipeline_config_dict)
            checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2)

            assert checkpoint2
            assert checkpoint2.state

            # 3. Perform all assertions on the states. The deleted topic should not be
            #    part of the second state
            state1 = cast(KafkaCheckpointState, checkpoint1.state)
            state2 = cast(KafkaCheckpointState, checkpoint2.state)
            difference_urns = list(state1.get_topic_urns_not_in(state2))

            assert len(difference_urns) == 1
            assert (
                difference_urns[0]
                == f"urn:li:dataset:(urn:li:dataPlatform:kafka,{platform_instance}.{kafka_ctx.topics[0]},PROD)"
            )

            # 4. Checkpoint configuration should be the same.
            assert checkpoint1.config == checkpoint2.config

            # 5. Validate that all providers have committed successfully.
            # NOTE: The following validation asserts for presence of state as well
            # and validates reporting.
            validate_all_providers_have_committed_successfully(
                pipeline=pipeline_run1, expected_providers=2
            )
            validate_all_providers_have_committed_successfully(
                pipeline=pipeline_run2, expected_providers=2
            )
Example #17
def test_nifi_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/nifi"
    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "nifi") as docker_services:
        wait_for_port(
            docker_services,
            container_name="nifi1",
            container_port=9443,
            timeout=300,
        )
        wait_for_port(
            docker_services,
            container_name="nifi01",
            container_port=9080,
            timeout=60,
        )
        wait_for_port(
            docker_services,
            container_name="nifi02",
            container_port=9081,
            timeout=60,
        )
        wait_for_port(
            docker_services,
            container_name="nifi03",
            container_port=9082,
            timeout=60,
        )

        # Wait for nifi to execute all processors
        time.sleep(120)

        # Run the metadata ingestion pipeline.
        with fs_helpers.isolated_filesystem(tmp_path):

            # Run nifi ingestion run.
            pipeline = Pipeline.create({
                "run_id": "nifi-test-standalone",
                "source": {
                    "type": "nifi",
                    "config": {
                        "site_url": "http://localhost:9443/nifi/",
                        # "auth": "CLIENT_CERT",
                        # "client_cert_file": f"{test_resources_dir}/setup/ssl_files/client-cert.pem",
                        # "client_key_file": f"{test_resources_dir}/setup/ssl_files/client-private-key.pem",
                        # "client_key_password": "******",
                        # "ca_file": f"{test_resources_dir}/setup/ssl_files/server_certfile.pem",
                        "process_group_pattern": {
                            "deny": ["^WIP"]
                        },
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": "./nifi_mces.json"
                    },
                },
            })
            pipeline.run()
            pipeline.raise_from_status()

            # Verify the output. ignore values for aspects having last_event_time values
            # TODO: ignore paths with respect to aspect value in case of MCPs
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="nifi_mces.json",
                golden_path=test_resources_dir /
                "nifi_mces_golden_standalone.json",
                ignore_paths=[
                    r"root\[1\]\['aspect'\]\['value'\]",
                    r"root\[5\]\['aspect'\]\['value'\]",
                    r"root\[7\]\['aspect'\]\['value'\]",
                ],
            )

            # Run nifi ingestion run.
            pipeline = Pipeline.create({
                "run_id": "nifi-test-cluster",
                "source": {
                    "type": "nifi",
                    "config": {
                        "site_url": "http://localhost:9080/nifi/",
                        "auth": "NO_AUTH",
                        "site_url_to_site_name": {
                            "http://nifi01:9080/nifi/": "default",
                            "http://nifi02:9081/nifi/": "default",
                        },
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": "./nifi_mces_cluster.json"
                    },
                },
            })
            pipeline.run()
            pipeline.raise_from_status()

            # Verify the output.
            # TODO: ignore paths with respect to aspect value in case of MCPs
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="nifi_mces_cluster.json",
                golden_path=test_resources_dir /
                "nifi_mces_golden_cluster.json",
                ignore_paths=[
                    r"root\[5\]\['aspect'\]\['value'\]",
                    r"root\[7\]\['aspect'\]\['value'\]",
                    r"root\[15\]\['aspect'\]\['value'\]",
                    r"root\[19\]\['aspect'\]\['value'\]",
                ],
            )