def test_transform_load_operator(
        self, mocker, postgresql, ports_collection, test_dag
    ):
        """Test if transform_load_operator upserts data into master db."""
        # Create mocks
        mocker.patch.object(
            PostgresHook,
            "get_conn",
            return_value=postgresql
        )
        mocker.patch.object(
            MongoHook,
            "get_collection",
            return_value=ports_collection
        )

        # Check if the source table has an item in it
        mongo_hook = MongoHook()
        collection = mongo_hook.get_collection()
        assert collection.count_documents({}) > 0

        # Check if the sink table is initially empty
        cursor = postgresql.cursor()
        cursor.execute("SELECT COUNT(*) FROM ports;")
        initial_result = cursor.fetchone()[0]
        assert initial_result == 0

        # Setup task
        mongo_staging_config = MongoConfig('mongo_default', 'ports')
        postgres_master_config = PostgresConfig('postgres_default')
        task = TransformAndLoadOperator(
            mongo_config=mongo_staging_config,
            postgres_config=postgres_master_config,
            task_id='test',
            processor=PortsItemProcessor(),
            query=SqlQueries.ports_table_insert,
            query_params={"updated_at": datetime.datetime.utcnow()},
            dag=test_dag
        )

        # Execute task and check if it inserted the data successfully
        task.execute(context={}, testing=True)
        cursor.execute("SELECT COUNT(*) FROM ports;")
        after_result = cursor.fetchone()[0]
        assert after_result > 0
    def test_transform_load_operator_database_error(
        self, mocker, postgresql, ports_collection, test_dag
    ):
        """Verify that the operator surfaces database errors during load."""
        # Route both hooks at the test fixtures.
        mocker.patch.object(PostgresHook, "get_conn", return_value=postgresql)
        mocker.patch.object(
            MongoHook, "get_collection", return_value=ports_collection
        )

        # The staging collection must hold at least one document.
        staging_collection = MongoHook().get_collection()
        assert staging_collection.count_documents({}) > 0

        # Point the insert query at a table that does not exist so the
        # load step fails at execution time.
        operator = TransformAndLoadOperator(
            mongo_config=MongoConfig('mongo_default', 'ports'),
            postgres_config=PostgresConfig('postgres_default'),
            task_id='test',
            processor=PortsItemProcessor(),
            query=SqlQueries.ports_table_insert.replace(
                'ports', 'ports_wrong'
            ),
            query_params={"updated_at": datetime.datetime.utcnow()},
            dag=test_dag
        )

        # NOTE(review): with Exception in this tuple the test passes for
        # *any* error; consider narrowing to (UndefinedTable, OperationalError).
        with raises((UndefinedTable, Exception, OperationalError)):
            # testing=False lets the operator close the connection itself.
            # NOTE(review): the second execute() below is unreachable when the
            # first call raises — confirm whether both calls are intended.
            operator.execute(context={}, testing=False)
            operator.execute(context={}, testing=True)
    def test_transform_load_operator_exception_error(
        self, mocker, postgresql, ports_collection, test_dag
    ):
        """Verify that the operator raises when given an invalid SQL query."""
        # Route both hooks at the test fixtures.
        mocker.patch.object(PostgresHook, "get_conn", return_value=postgresql)
        mocker.patch.object(
            MongoHook, "get_collection", return_value=ports_collection
        )

        # The staging collection must hold at least one document.
        staging_collection = MongoHook().get_collection()
        assert staging_collection.count_documents({}) > 0

        # Build the operator with a query that is not valid SQL at all.
        operator = TransformAndLoadOperator(
            mongo_config=MongoConfig('mongo_default', 'ports'),
            postgres_config=PostgresConfig('postgres_default'),
            task_id='test',
            processor=PortsItemProcessor(),
            query='Wrong SQL query',
            dag=test_dag
        )

        # Executing the broken query must raise.
        with raises(Exception):
            operator.execute(context={}, testing=True)
# --- Beispiel #4 (0) — scraped example separator, kept as a comment ---
    def test_save_to_json_operator(
        self, mocker, postgresql, ports_collection, test_dag,
        tmp_path: Path
    ):
        """Verify that the operator writes query results to the given path."""
        # Point both hooks at the test fixtures.
        mocker.patch.object(PostgresHook, "get_conn", return_value=postgresql)
        mocker.patch.object(
            MongoHook, "get_collection", return_value=ports_collection
        )

        # The staging collection must hold at least one document.
        staging_collection = MongoHook().get_collection()
        assert staging_collection.count_documents({}) > 0

        # Seed the master table by running the transform/load step first.
        postgres_master_config = PostgresConfig('postgres_default')
        transform_load = TransformAndLoadOperator(
            mongo_config=MongoConfig('mongo_default', 'ports'),
            postgres_config=postgres_master_config,
            task_id='test',
            processor=PortsItemProcessor(),
            query=SqlQueries.ports_table_insert,
            query_params={"updated_at": datetime.datetime.utcnow()},
            dag=test_dag
        )

        # Run it and confirm the master table now has rows.
        transform_load.execute(context={}, testing=True)
        cursor = PostgresHook().get_conn().cursor()
        cursor.execute("SELECT COUNT(*) FROM ports;")
        assert cursor.fetchone()[0] > 0

        # Use a not-yet-existing subdirectory to force the operator to
        # create the output path itself.
        tmp_path = tmp_path / 'unknown-path'

        # Export the `ports` table to JSON under tmp_path.
        save_to_json = LoadToJsonOperator(
            task_id='export_to_json',
            postgres_config=postgres_master_config,
            query=SqlQueries.select_all_query_to_json,
            path=tmp_path,
            tables=['ports'],
            dag=test_dag
        )
        save_to_json.execute(
            {'execution_date': datetime.datetime(2021, 1, 1)}
        )

        # The filename embeds the execution date in basic ISO format.
        output_path = tmp_path / 'ports_20210101T000000.json'

        expected_data = {
            'ports': [{
                'id': 1,
                'countryName': 'Philippines',
                'portName': 'Aleran/Ozamis',
                'unlocode': 'PH ALE',
                'coordinates': '4234N 00135E'
            }]
        }

        # The written file must parse and match the expected payload exactly.
        with open(output_path, "r") as f:
            result = json.load(f)
        assert 'ports' in result
        assert result == expected_data
# --- Beispiel #5 (0) — scraped example separator, kept as a comment ---
    def test_save_to_json_operator_database_error(
        self, mocker, postgresql, ports_collection, test_dag,
        tmp_path: Path
    ):
        """Verify that the operator surfaces database errors during export."""
        # Point both hooks at the test fixtures.
        mocker.patch.object(PostgresHook, "get_conn", return_value=postgresql)
        mocker.patch.object(
            MongoHook, "get_collection", return_value=ports_collection
        )

        # The staging collection must hold at least one document.
        staging_collection = MongoHook().get_collection()
        assert staging_collection.count_documents({}) > 0

        # Seed the master table by running the transform/load step first.
        postgres_master_config = PostgresConfig('postgres_default')
        transform_load = TransformAndLoadOperator(
            mongo_config=MongoConfig('mongo_default', 'ports'),
            postgres_config=postgres_master_config,
            task_id='test',
            processor=PortsItemProcessor(),
            query=SqlQueries.ports_table_insert,
            query_params={"updated_at": datetime.datetime.utcnow()},
            dag=test_dag
        )

        # Run it and confirm the master table now has rows.
        transform_load.execute(context={}, testing=True)
        cursor = PostgresHook().get_conn().cursor()
        cursor.execute("SELECT COUNT(*) FROM ports;")
        assert cursor.fetchone()[0] > 0

        # Ask the export to dump a table that does not exist.
        save_to_json = LoadToJsonOperator(
            task_id='test2',
            postgres_config=postgres_master_config,
            query=SqlQueries.select_all_query_to_json,
            path=tmp_path,
            tables=['foo'],
            dag=test_dag
        )
        # NOTE(review): with Exception in this tuple the test passes for
        # *any* error; consider narrowing to (UndefinedTable, OperationalError).
        with raises((UndefinedTable, OperationalError, Exception)):
            # testing=False lets the operator close the connection itself.
            # NOTE(review): the second execute() below is unreachable when the
            # first call raises — confirm whether both calls are intended.
            save_to_json.execute(
                {'execution_date': datetime.datetime(2021, 1, 1)},
                testing=False
            )
            save_to_json.execute(
                {'execution_date': datetime.datetime(2021, 1, 1)},
                testing=True
            )
# --- Beispiel #6 (0) — scraped example separator, kept as a comment ---
    bash_command="cd ~/airflow/scraper && scrapy check ports_spider",
    dag=dag)

# Run the scraper's entry point from the Airflow home directory to pull
# UNECE port data into the pipeline.
scrape_unece_data = BashOperator(
    task_id="scrape_unece_data",
    bash_command="cd ~/airflow && python scraper/main.py",
    dag=dag)

# Load the scraped items into the MongoDB staging area.
stage_to_mongodb = StageDatatoMongodbOperator(task_id='stage_to_mongodb',
                                              dag=dag)

# Transform staged documents and upsert them into the Postgres master db.
# NOTE(review): utcnow() is evaluated when this DAG file is parsed, not at
# task run time — confirm a parse-time timestamp is intended here.
transform_and_load_to_postgres = TransformAndLoadOperator(
    mongo_config=mongo_staging_config,
    postgres_config=postgres_master_config,
    task_id='transform_and_load_to_postgres',
    processor=PortsItemProcessor(),
    query=SqlQueries.ports_table_insert,
    query_params={"updated_at": datetime.datetime.utcnow()},
    dag=dag)

# Run data-quality queries (row count and updated-row count) against the
# master `ports` table.
check_data_quality = DataQualityCheckOperator(
    tables=['ports'],
    postgres_config=postgres_master_config,
    task_id='check_master_data_quality',
    queries={
        "ports_row_count": SqlQueries.ports_row_count,
        "ports_updated_count": SqlQueries.ports_updated_count
    },
    dag=dag)

# Placeholder step for a future load into the analytics data warehouse.
load_to_analytics_dw = DummyOperator(task_id='load_to_analytics_dw', dag=dag)