Ejemplo n.º 1
0
    def test_dags_with_invalid_tasks(self):
        """Check that ``InvalidDag`` is raised for a task scheduled in an unknown DAG.

        The task metadata names ``bqetl_non_exisiting_dag`` while the collection
        is built with only ``bqetl_test_dag``.
        """
        # The whole setup runs inside the context manager, so an ``InvalidDag``
        # raised by any of these calls satisfies the test.
        with pytest.raises(InvalidDag):
            scheduling = {
                "dag_name": "bqetl_non_exisiting_dag",
                "depends_on_past": True,
                "param": "test_param",
            }
            dataset_dir = TEST_DIR / "data" / "test_sql" / "test"
            sql_path = dataset_dir / "incremental_query_v1" / "query.sql"
            query_metadata = Metadata(
                "test", "test", ["*****@*****.**"], {}, scheduling
            )
            invalid_tasks = [Task.of_query(sql_path, query_metadata)]

            dag_definitions = {
                "bqetl_test_dag": {
                    "schedule_interval": "daily",
                    "default_args": self.default_args,
                }
            }
            collection = DagCollection.from_dict(dag_definitions)
            collection.with_tasks(invalid_tasks)
Ejemplo n.º 2
0
    def test_to_airflow_duplicate_dependencies(self, tmp_path):
        """Render a DAG whose two tasks declare the same external dependency.

        Both tasks share one ``Metadata`` with a single ``depends_on`` entry; the
        generated ``bqetl_test_dag.py`` is compared against the stored
        ``test_dag_duplicate_dependencies`` file.
        """
        dataset_dir = (
            TEST_DIR / "data" / "test_sql" / "moz-fx-data-test-project" / "test"
        )
        query_paths = (
            dataset_dir / "non_incremental_query_v1" / "query.sql",
            dataset_dir / "no_metadata_query_v1" / "query.sql",
        )

        scheduling = {
            "dag_name": "bqetl_test_dag",
            "depends_on_past": True,
            "depends_on": [{"dag_name": "external", "task_id": "task1"}],
        }
        shared_metadata = Metadata(
            "test", "test", ["*****@*****.**"], {}, scheduling
        )

        # One task per query file, all pointing at the same metadata object.
        tasks = [Task.of_query(path, shared_metadata) for path in query_paths]

        dag_definitions = {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(tasks)

        dags.to_airflow_dags(tmp_path)

        generated_file = tmp_path / "bqetl_test_dag.py"
        expected_file = (
            TEST_DIR / "data" / "dags" / "test_dag_duplicate_dependencies"
        )
        result = generated_file.read_text().strip()
        expected = expected_file.read_text().strip()

        assert result == expected
Ejemplo n.º 3
0
    def test_public_data_json_dag_add_task(self):
        """Add one export task to an empty public-data JSON DAG and inspect it."""
        export_dag = PublicDataJsonDag(
            "bqetl_public_data_json_dag", "daily", self.default_args
        )

        # The DAG starts out without any tasks.
        assert len(export_dag.tasks) == 0

        dataset_dir = (
            TEST_DIR / "data" / "test_sql" / "moz-fx-data-test-project" / "test"
        )
        source_task = Task.of_query(
            dataset_dir / "incremental_query_v1" / "query.sql"
        )

        events_defaults = DagDefaultArgs("*****@*****.**", "2020-01-01")
        events_dag = Dag("bqetl_events", "0 1 * * *", events_defaults, [source_task])

        export_dag.add_export_tasks([source_task], DagCollection([events_dag]))

        assert len(export_dag.tasks) == 1

        export_task = export_dag.tasks[0]
        expected_name = "export_public_data_json_test__incremental_query__v1"
        assert export_task.task_name == expected_name
        assert export_task.dag_name == "bqetl_public_data_json_dag"
        assert len(export_task.dependencies) == 1
Ejemplo n.º 4
0
    def test_dags_with_tasks(self):
        """Attach a single task to ``test_dag`` and check it is registered there."""
        dataset_dir = TEST_DIR / "data" / "test_sql" / "test"
        sql_path = dataset_dir / "incremental_query_v1" / "query.sql"

        scheduling = {
            "dag_name": "test_dag",
            "depends_on_past": True,
            "param": "test_param",
        }
        task_metadata = Metadata("test", "test", {}, scheduling)

        scheduled = [Task(sql_path, task_metadata)]

        dag_definitions = {
            "test_dag": {"schedule_interval": "daily", "default_args": {}}
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(scheduled)

        assert len(dags.dags) == 1

        test_dag = dags.dag_by_name("test_dag")
        assert len(test_dag.tasks) == 1

        only_task = test_dag.tasks[0]
        assert only_task.dag_name == "test_dag"
Ejemplo n.º 5
0
    def test_public_json_dag_to_airflow(self, tmp_path):
        """Compare the generated ``bqetl_public_data_json.py`` with the stored file.

        The task is created from the query file alone (no explicit metadata is
        passed), and two DAGs sharing the same default arguments are configured.
        """
        dataset_dir = TEST_DIR / "data" / "test_sql" / "test"
        sql_path = dataset_dir / "non_incremental_query_v1" / "query.sql"
        tasks = [Task.of_query(sql_path)]

        # Both DAG definitions reference this one dictionary object.
        default_args = {
            "depends_on_past": False,
            "owner": "*****@*****.**",
            "email": ["*****@*****.**"],
            "start_date": "2020-01-01",
            "retry_delay": "1h",
        }
        dag_definitions = {
            "bqetl_public_data_json": {
                "schedule_interval": "daily",
                "default_args": default_args,
            },
            "bqetl_core": {
                "schedule_interval": "daily",
                "default_args": default_args,
            },
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(tasks)

        dags.to_airflow_dags(tmp_path)

        generated_file = tmp_path / "bqetl_public_data_json.py"
        expected_file = TEST_DIR / "data" / "dags" / "test_public_data_json_dag"
        result = generated_file.read_text().strip()
        expected_dag = expected_file.read_text().strip()

        assert result == expected_dag
Ejemplo n.º 6
0
    def test_dags_with_tasks(self):
        """Attach a single task to ``bqetl_test_dag`` and check it is registered."""
        dataset_dir = (
            TEST_DIR / "data" / "test_sql" / "moz-fx-data-test-project" / "test"
        )
        sql_path = dataset_dir / "incremental_query_v1" / "query.sql"

        scheduling = {"dag_name": "bqetl_test_dag", "depends_on_past": True}
        task_metadata = Metadata(
            "test", "test", ["*****@*****.**"], {}, scheduling
        )

        scheduled = [Task.of_query(sql_path, task_metadata)]

        dag_definitions = {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": self.default_args,
            }
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(scheduled)

        assert len(dags.dags) == 1

        test_dag = dags.dag_by_name("bqetl_test_dag")
        assert len(test_dag.tasks) == 1

        only_task = test_dag.tasks[0]
        assert only_task.dag_name == "bqetl_test_dag"
Ejemplo n.º 7
0
    def test_dags_from_dict(self):
        """Build a collection from a dictionary and verify both DAG definitions."""
        dag_definitions = {
            "test_dag1": {
                "schedule_interval": "daily",
                "default_args": {"owner": "*****@*****.**"},
            },
            "test_dag2": {"schedule_interval": "daily", "default_args": {}},
        }
        dags = DagCollection.from_dict(dag_definitions)

        assert len(dags.dags) == 2

        # Default arguments each DAG is expected to expose, in definition order.
        expected_default_args = {
            "test_dag1": {"owner": "*****@*****.**"},
            "test_dag2": {},
        }

        for dag_name in expected_default_args:
            assert dags.dag_by_name(dag_name) is not None

        for dag_name, default_args in expected_default_args.items():
            dag = dags.dag_by_name(dag_name)
            assert len(dag.tasks) == 0
            assert dag.schedule_interval == "daily"
            assert dag.default_args == default_args
Ejemplo n.º 8
0
def get_dags(sql_dir, dags_config):
    """Return the DAGs configured in ``dags_config`` with their query tasks attached.

    ``sql_dir`` is walked recursively; every directory containing
    ``QUERY_FILE`` is turned into a task via ``Task.of_query``. Queries that
    raise ``FileNotFoundError`` or ``UnscheduledTask`` are skipped.

    When ``sql_dir`` is not a directory an error is logged and the DAG
    collection is still returned, with no tasks attached.
    """
    scheduled_tasks = []

    if not os.path.isdir(sql_dir):
        logging.error("""
            Invalid sql_dir: {}, sql_dir must be a directory with
            structure /<dataset>/<table>/metadata.yaml.
            """.format(sql_dir))
    else:
        # Scheduling information is read from the metadata.yaml next to each
        # query file.
        for current_dir, _subdirs, file_names in os.walk(sql_dir):
            if QUERY_FILE not in file_names:
                continue

            query_path = os.path.join(current_dir, QUERY_FILE)
            try:
                scheduled_tasks.append(Task.of_query(query_path))
            except FileNotFoundError:
                # The query has no metadata.yaml file; skip it.
                continue
            except UnscheduledTask:
                # Most tasks lack scheduling information for now, so this is
                # deliberately silent.
                continue

    return DagCollection.from_file(dags_config).with_tasks(scheduled_tasks)
Ejemplo n.º 9
0
    def test_task_for_table(self):
        """Look up the task registered for ``test.incremental_query_v1``."""
        dataset_dir = TEST_DIR / "data" / "test_sql" / "test"
        sql_path = dataset_dir / "incremental_query_v1" / "query.sql"

        scheduling = {"dag_name": "bqetl_test_dag", "depends_on_past": True}
        task_metadata = Metadata(
            "test", "test", ["*****@*****.**"], {}, scheduling
        )

        scheduled = [Task.of_query(sql_path, task_metadata)]

        dag_definitions = {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": self.default_args,
            }
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(scheduled)

        found = dags.task_for_table("test", "incremental_query_v1")

        assert found
        assert found.dag_name == "bqetl_test_dag"
Ejemplo n.º 10
0
    def test_task_get_nested_view_dependencies(
        self, tmp_path, bigquery_client, project_id, temporary_dataset
    ):
        """Resolve table dependencies through a view that selects from another view.

        The query reads ``table1_v1`` directly and ``table2_v1`` through
        ``test_view`` -> ``test_view2``; both tables must appear among the
        task's dependencies.
        """
        dataset_ref = f"{project_id}.{temporary_dataset}"
        dataset_dir = tmp_path / "sql" / temporary_dataset

        query_dir = dataset_dir / "query_v1"
        os.makedirs(query_dir)
        query_file = query_dir / "query.sql"
        query_file.write_text(
            f"SELECT * FROM {dataset_ref}.table1_v1 "
            f"UNION ALL SELECT * FROM {dataset_ref}.test_view"
        )

        schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
        for table_name in ("table1_v1", "table2_v1"):
            bigquery_client.create_table(
                bigquery.Table(f"{dataset_ref}.{table_name}", schema=schema)
            )

        # The inner view is created first so the outer view can select from it.
        for view_name, source_name in (
            ("test_view2", "table2_v1"),
            ("test_view", "test_view2"),
        ):
            view = bigquery.Table(f"{dataset_ref}.{view_name}")
            view.view_query = f"SELECT * FROM {dataset_ref}.{source_name}"
            bigquery_client.create_table(view)

        metadata = Metadata(
            "test", "test", ["*****@*****.**"], {}, self.default_scheduling
        )

        task = Task.of_query(query_file, metadata)
        table_tasks = [
            Task.of_query(dataset_dir / table_name / "query.sql", metadata)
            for table_name in ("table1_v1", "table2_v1")
        ]

        dag_definitions = {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(
            [task, *table_tasks]
        )

        task.with_dependencies(bigquery_client, dags)
        tables = [
            f"{dep.dataset}__{dep.table}__{dep.version}" for dep in task.dependencies
        ]

        assert f"{temporary_dataset}__table1__v1" in tables
        assert f"{temporary_dataset}__table2__v1" in tables
Ejemplo n.º 11
0
    def test_task_for_non_existing_table(self):
        """``task_for_table`` returns ``None`` when no task matches the table."""
        dag_definitions = {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": self.default_args,
            }
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks([])

        missing = dags.task_for_table("test", "non_existing_table")
        assert missing is None
Ejemplo n.º 12
0
    def test_dag_by_name(self):
        """``dag_by_name`` finds a configured DAG and returns ``None`` otherwise."""
        dag_definitions = {
            "test_dag1": {"schedule_interval": "daily", "default_args": {}}
        }
        dags = DagCollection.from_dict(dag_definitions)

        assert dags.dag_by_name("test_dag1") is not None

        found = dags.dag_by_name("test_dag1")
        assert found.name == "test_dag1"

        missing = dags.dag_by_name("non_existing")
        assert missing is None
Ejemplo n.º 13
0
def get_dags(project_id, dags_config):
    """Return all configured DAGs including associated tasks.

    The DAG collection is loaded from ``dags_config``. Every directory returned
    by ``project_dirs(project_id)`` is then walked recursively and each
    sub-directory containing a recognised file is turned into a task. The
    file types are checked in this order and only the first match is used:
    ``QUERY_FILE``, ``QUERY_PART_FILE`` (multipart query), ``SCRIPT_FILE``,
    ``PYTHON_SCRIPT_FILE``.

    Args:
        project_id: Passed to ``project_dirs`` to obtain the directories to scan.
        dags_config: Path handed to ``DagCollection.from_file``.

    Returns:
        The loaded DAG collection with all discovered tasks attached via
        ``with_tasks``.

    Raises:
        Exception: Any error other than ``FileNotFoundError`` or
            ``UnscheduledTask`` raised while creating a task is logged together
            with the offending file and then re-raised unchanged.
    """
    tasks = []
    dag_collection = DagCollection.from_file(dags_config)

    # (file name, task factory) pairs in priority order.
    task_factories = (
        (QUERY_FILE, Task.of_query),
        (QUERY_PART_FILE, Task.of_multipart_query),
        (SCRIPT_FILE, Task.of_script),
        (PYTHON_SCRIPT_FILE, Task.of_python_script),
    )

    for project_dir in project_dirs(project_id):
        if not os.path.isdir(project_dir):
            logging.error("""
                Invalid project_dir: {}, project_dir must be a directory with
                structure <sql>/<project>/<dataset>/<table>/metadata.yaml.
                """.format(project_dir))
            continue

        # parse metadata.yaml to retrieve scheduling information
        for root, _dirs, files in os.walk(project_dir):
            match = next(
                (
                    (file_name, factory)
                    for file_name, factory in task_factories
                    if file_name in files
                ),
                None,
            )
            if match is None:
                # nothing in this directory can be scheduled
                continue

            file_name, create_task = match
            query_file = os.path.join(root, file_name)

            try:
                task = create_task(query_file, dag_collection=dag_collection)
            except FileNotFoundError:
                # query has no metadata.yaml file; skip
                pass
            except UnscheduledTask:
                # most tasks lack scheduling information for now; skipped
                # silently on purpose
                pass
            except Exception:
                # in the case that there was some other error, report the query
                # that failed before exiting; a bare ``raise`` re-raises the
                # active exception as-is
                logging.error(
                    f"Error processing task for query {query_file}")
                raise
            else:
                tasks.append(task)

    return dag_collection.with_tasks(tasks)
Ejemplo n.º 14
0
    def test_task_get_view_dependencies(self, tmp_path, bigquery_client,
                                        project_id, temporary_dataset):
        """Resolve table dependencies of a query that also reads from a view.

        The query selects from ``table1_v1`` and from ``test_view``, which in
        turn selects from ``table2_v1``; both tables must be reported.
        """
        dataset_ref = f"{project_id}.{temporary_dataset}"
        dataset_dir = tmp_path / "sql" / temporary_dataset

        query_dir = dataset_dir / "query_v1"
        os.makedirs(query_dir)
        query_file = query_dir / "query.sql"
        query_file.write_text(
            f"SELECT * FROM {dataset_ref}.table1_v1 "
            f"UNION ALL SELECT * FROM {dataset_ref}.test_view"
        )

        schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
        for table_name in ("table1_v1", "table2_v1"):
            bigquery_client.create_table(
                bigquery.Table(f"{dataset_ref}.{table_name}", schema=schema)
            )

        view = bigquery.Table(f"{dataset_ref}.test_view")
        view.view_query = f"SELECT * FROM {dataset_ref}.table2_v1"
        bigquery_client.create_table(view)

        scheduling = {
            "dag_name": "test_dag",
            "depends_on_past": True,
            "param": "test_param",
        }
        metadata = Metadata("test", "test", {}, scheduling)

        task = Task(query_file, metadata)
        table_tasks = [
            Task(dataset_dir / table_name / "query.sql", metadata)
            for table_name in ("table1_v1", "table2_v1")
        ]

        dag_definitions = {
            "test_dag": {"schedule_interval": "daily", "default_args": {}}
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(
            [task, *table_tasks]
        )

        dependencies = task.get_dependencies(bigquery_client, dags)
        tables = [
            f"{dep.dataset}__{dep.table}__{dep.version}" for dep in dependencies
        ]

        assert f"{temporary_dataset}__table1__v1" in tables
        assert f"{temporary_dataset}__table2__v1" in tables
Ejemplo n.º 15
0
    def test_task_get_nested_view_dependencies(self, tmp_path):
        """Resolve dependencies through two levels of view definitions on disk.

        ``query.sql`` reads ``table1_v1`` and ``test_view``; ``test_view`` selects
        from ``test_view2``, which selects from ``table2_v1``. Both tables must
        show up as task ids among the dependencies.
        """
        dataset_dir = tmp_path / "test-project" / "test"

        # (directory, file name, SQL) written in this order.
        sql_files = (
            (
                "query_v1",
                "query.sql",
                "SELECT * FROM `test-project`.test.table1_v1 "
                "UNION ALL SELECT * FROM `test-project`.test.test_view",
            ),
            (
                "test_view",
                "view.sql",
                "CREATE OR REPLACE VIEW `test-project`.test.test_view "
                "AS SELECT * FROM `test-project`.test.test_view2",
            ),
            (
                "test_view2",
                "view.sql",
                "CREATE OR REPLACE VIEW `test-project`.test.test_view2 "
                "AS SELECT * FROM `test-project`.test.table2_v1",
            ),
        )
        for dir_name, file_name, sql in sql_files:
            target_dir = dataset_dir / dir_name
            os.makedirs(target_dir)
            (target_dir / file_name).write_text(sql)

        query_file = dataset_dir / "query_v1" / "query.sql"

        metadata = Metadata(
            "test", "test", ["*****@*****.**"], {}, self.default_scheduling
        )

        task = Task.of_query(query_file, metadata)
        table_tasks = [
            Task.of_query(dataset_dir / table_name / "query.sql", metadata)
            for table_name in ("table1_v1", "table2_v1")
        ]

        dag_definitions = {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(
            [task, *table_tasks]
        )

        task.with_dependencies(dags)
        dependency_ids = [dep.task_id for dep in task.dependencies]

        assert "test__table1__v1" in dependency_ids
        assert "test__table2__v1" in dependency_ids
Ejemplo n.º 16
0
    def test_multipart_task_get_dependencies(self, tmp_path, bigquery_client,
                                             project_id, temporary_dataset):
        """Collect dependencies from both parts of a multipart query.

        ``part1.sql`` reads ``table1_v1`` and ``part2.sql`` reads ``table2_v1``;
        both tables are created in BigQuery and must be listed as dependencies.
        """
        dataset_ref = f"{project_id}.{temporary_dataset}"
        dataset_dir = tmp_path / project_id / temporary_dataset

        query_dir = dataset_dir / "query_v1"
        os.makedirs(query_dir)

        first_part = query_dir / "part1.sql"
        first_part.write_text(f"SELECT * FROM {dataset_ref}.table1_v1")
        second_part = query_dir / "part2.sql"
        second_part.write_text(f"SELECT * FROM {dataset_ref}.table2_v1")

        schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
        for table_name in ("table1_v1", "table2_v1"):
            bigquery_client.create_table(
                bigquery.Table(f"{dataset_ref}.{table_name}", schema=schema)
            )

        metadata = Metadata(
            "test", "test", ["*****@*****.**"], {}, self.default_scheduling
        )

        # The multipart task is created from the first part file.
        task = Task.of_multipart_query(first_part, metadata)
        table_tasks = [
            Task.of_query(dataset_dir / table_name / "query.sql", metadata)
            for table_name in ("table1_v1", "table2_v1")
        ]

        dag_definitions = {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(
            [task, *table_tasks]
        )

        task.with_dependencies(dags)
        dependency_ids = [dep.task_id for dep in task.dependencies]

        assert f"{temporary_dataset}__table1__v1" in dependency_ids
        assert f"{temporary_dataset}__table2__v1" in dependency_ids
Ejemplo n.º 17
0
    def test_dags_from_file(self):
        """Load ``dags.yaml`` and check both configured DAGs exist without tasks."""
        dags = DagCollection.from_file(TEST_DIR / "data" / "dags.yaml")

        assert len(dags.dags) == 2
        assert dags.dag_by_name("not existing") is None

        configured_names = ("bqetl_events", "bqetl_core")
        for dag_name in configured_names:
            assert dags.dag_by_name(dag_name) is not None

        # Loading from a file alone attaches no tasks.
        for dag_name in configured_names:
            loaded_dag = dags.dag_by_name(dag_name)
            assert len(loaded_dag.tasks) == 0
Ejemplo n.º 18
0
    def test_task_get_dependencies_none(self, tmp_path, bigquery_client):
        """A query that references no tables ends up with an empty dependency list."""
        query_dir = tmp_path / "sql" / "test" / "query_v1"
        os.makedirs(query_dir)

        sql_path = query_dir / "query.sql"
        sql_path.write_text("SELECT 123423")

        task_metadata = Metadata(
            "test", "test", ["*****@*****.**"], {}, self.default_scheduling
        )
        task = Task.of_query(sql_path, task_metadata)

        empty_collection = DagCollection.from_dict({})
        task.with_dependencies(bigquery_client, empty_collection)

        assert task.dependencies == []
Ejemplo n.º 19
0
    def test_to_airflow(self, tmp_path):
        """Compare the generated ``bqetl_test_dag.py`` with ``simple_test_dag``."""
        dataset_dir = (
            TEST_DIR / "data" / "test_sql" / "moz-fx-data-test-project" / "test"
        )
        sql_path = dataset_dir / "non_incremental_query_v1" / "query.sql"

        scheduling = {
            "dag_name": "bqetl_test_dag",
            "depends_on_past": True,
            "param": "test_param",
            "arguments": ["--append_table"],
        }
        task_metadata = Metadata(
            "test", "test", ["*****@*****.**"], {}, scheduling
        )
        tasks = [Task.of_query(sql_path, task_metadata)]

        dag_definitions = {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "depends_on_past": False,
                    "owner": "*****@*****.**",
                    "email": ["*****@*****.**"],
                    "start_date": "2020-01-01",
                    "retry_delay": "1h",
                },
            }
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(tasks)

        dags.to_airflow_dags(tmp_path)

        generated_file = tmp_path / "bqetl_test_dag.py"
        expected_file = TEST_DIR / "data" / "dags" / "simple_test_dag"
        result = generated_file.read_text().strip()
        expected = expected_file.read_text().strip()

        assert result == expected
Ejemplo n.º 20
0
    def test_multipart_task_get_dependencies(self, tmp_path):
        """Collect dependencies from both parts of a multipart query on disk.

        ``part1.sql`` reads ``table1_v1`` and ``part2.sql`` reads ``table2_v1``;
        both must be listed as task ids among the dependencies.
        """
        dataset_dir = tmp_path / "test-project" / "test"

        query_dir = dataset_dir / "query_v1"
        os.makedirs(query_dir)

        first_part = query_dir / "part1.sql"
        first_part.write_text("SELECT * FROM `test-project`.test.table1_v1")
        second_part = query_dir / "part2.sql"
        second_part.write_text("SELECT * FROM `test-project`.test.table2_v1")

        metadata = Metadata(
            "test", "test", ["*****@*****.**"], {}, self.default_scheduling
        )

        # The multipart task is created from the first part file.
        task = Task.of_multipart_query(first_part, metadata)
        table_tasks = [
            Task.of_query(dataset_dir / table_name / "query.sql", metadata)
            for table_name in ("table1_v1", "table2_v1")
        ]

        dag_definitions = {
            "bqetl_test_dag": {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-01-01",
                },
            }
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(
            [task, *table_tasks]
        )

        task.with_dependencies(dags)
        dependency_ids = [dep.task_id for dep in task.dependencies]

        assert "test__table1__v1" in dependency_ids
        assert "test__table2__v1" in dependency_ids
Ejemplo n.º 21
0
    def test_task_get_dependencies_none(self, tmp_path, bigquery_client):
        """A query that references no tables yields an empty dependency list."""
        query_dir = tmp_path / "sql" / "test" / "query_v1"
        os.makedirs(query_dir)

        sql_path = query_dir / "query.sql"
        sql_path.write_text("SELECT 123423")

        scheduling = {
            "dag_name": "test_dag",
            "depends_on_past": True,
            "param": "test_param",
        }
        task = Task(sql_path, Metadata("test", "test", {}, scheduling))

        empty_collection = DagCollection.from_dict({})
        dependencies = task.get_dependencies(bigquery_client, empty_collection)

        assert dependencies == []
Ejemplo n.º 22
0
    def test_to_airflow_with_dependencies(self, tmp_path):
        """Render two DAGs where one query depends on tables from both of them.

        The main query reads ``table1_v1`` and ``table2_v1`` (scheduled in
        ``bqetl_test_dag``) and ``external_table_v1`` (scheduled in
        ``bqetl_external_test_dag``). Both generated DAG files are compared with
        the stored expectations.
        """
        dataset_dir = tmp_path / "test-project" / "test"

        def metadata_for(dag_name):
            # Fresh metadata object scheduled in ``dag_name``.
            return Metadata(
                "test",
                "test",
                ["*****@*****.**"],
                {},
                {
                    "dag_name": dag_name,
                    "default_args": {"owner": "*****@*****.**"},
                },
            )

        def task_then_query_file(table_name, table_metadata, sql):
            # The task is created first; its directory and query file are
            # written only afterwards.
            table_dir = dataset_dir / table_name
            table_task = Task.of_query(table_dir / "query.sql", table_metadata)
            os.makedirs(table_dir)
            (table_dir / "query.sql").write_text(sql)
            return table_task

        main_query_dir = dataset_dir / "query_v1"
        os.makedirs(main_query_dir)
        main_query = main_query_dir / "query.sql"
        main_query.write_text(
            "SELECT * FROM `test-project`.test.table1_v1 "
            "UNION ALL SELECT * FROM `test-project`.test.table2_v1 "
            "UNION ALL SELECT * FROM `test-project`.test.external_table_v1")

        test_dag_metadata = metadata_for("bqetl_test_dag")
        task = Task.of_query(main_query, test_dag_metadata)
        table_task1 = task_then_query_file("table1_v1", test_dag_metadata, "SELECT 1")
        table_task2 = task_then_query_file("table2_v1", test_dag_metadata, "SELECT 2")

        external_metadata = metadata_for("bqetl_external_test_dag")
        external_table_task = task_then_query_file(
            "external_table_v1", external_metadata, "SELECT 3"
        )

        dag_definitions = {
            dag_name: {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-05-25",
                },
            }
            for dag_name in ("bqetl_test_dag", "bqetl_external_test_dag")
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(
            [task, table_task1, table_task2, external_table_task]
        )

        dags.to_airflow_dags(tmp_path)

        expected_dir = TEST_DIR / "data" / "dags"
        expected_dag_with_dependencies = (
            (expected_dir / "test_dag_with_dependencies").read_text().strip()
        )
        expected_dag_external_dependency = (
            (expected_dir / "test_dag_external_dependency").read_text().strip()
        )

        dag_with_dependencies = (tmp_path / "bqetl_test_dag.py").read_text().strip()
        dag_external_dependency = (
            (tmp_path / "bqetl_external_test_dag.py").read_text().strip()
        )

        assert dag_with_dependencies == expected_dag_with_dependencies
        assert dag_external_dependency == expected_dag_external_dependency
Ejemplo n.º 23
0
 def test_dags_from_invalid_dict(self):
     """A dictionary that is not a valid DAG definition raises ``DagParseException``."""
     malformed_definitions = {"foo": "bar"}
     with pytest.raises(DagParseException):
         DagCollection.from_dict(malformed_definitions)
Ejemplo n.º 24
0
 def test_dags_from_empty_dict(self):
     """An empty dictionary produces a collection without any DAGs."""
     empty_collection = DagCollection.from_dict({})
     dag_count = len(empty_collection.dags)
     assert dag_count == 0
Ejemplo n.º 25
0
    def test_to_airflow_with_dependencies(self, tmp_path, project_id,
                                          temporary_dataset, bigquery_client):
        """Render two DAGs where one query depends on tables from both of them.

        The referenced tables are created in the temporary BigQuery dataset.
        The expected DAG files are rendered from templates, with the temporary
        dataset name passed in as ``temporary_dataset``.
        """
        dataset_ref = f"{project_id}.{temporary_dataset}"
        dataset_dir = tmp_path / "sql" / temporary_dataset

        def metadata_for(dag_name):
            # Fresh metadata object scheduled in ``dag_name``.
            return Metadata(
                "test",
                "test",
                ["*****@*****.**"],
                {},
                {
                    "dag_name": dag_name,
                    "default_args": {"owner": "*****@*****.**"},
                },
            )

        def task_then_query_file(table_name, table_metadata, sql):
            # The task is created first; its directory and query file are
            # written only afterwards.
            table_dir = dataset_dir / table_name
            table_task = Task.of_query(table_dir / "query.sql", table_metadata)
            os.makedirs(table_dir)
            (table_dir / "query.sql").write_text(sql)
            return table_task

        main_query_dir = dataset_dir / "query_v1"
        os.makedirs(main_query_dir)
        main_query = main_query_dir / "query.sql"
        main_query.write_text(
            f"SELECT * FROM {dataset_ref}.table1_v1 "
            f"UNION ALL SELECT * FROM {dataset_ref}.table2_v1 "
            "UNION ALL SELECT * FROM "
            f"{dataset_ref}.external_table_v1")

        schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
        for table_name in ("table1_v1", "table2_v1", "external_table_v1"):
            bigquery_client.create_table(
                bigquery.Table(f"{dataset_ref}.{table_name}", schema=schema)
            )

        test_dag_metadata = metadata_for("bqetl_test_dag")
        task = Task.of_query(main_query, test_dag_metadata)
        table_task1 = task_then_query_file("table1_v1", test_dag_metadata, "SELECT 1")
        table_task2 = task_then_query_file("table2_v1", test_dag_metadata, "SELECT 2")

        external_metadata = metadata_for("bqetl_external_test_dag")
        external_table_task = task_then_query_file(
            "external_table_v1", external_metadata, "SELECT 3"
        )

        dag_definitions = {
            dag_name: {
                "schedule_interval": "daily",
                "default_args": {
                    "owner": "*****@*****.**",
                    "start_date": "2020-05-25",
                },
            }
            for dag_name in ("bqetl_test_dag", "bqetl_external_test_dag")
        }
        dags = DagCollection.from_dict(dag_definitions).with_tasks(
            [task, table_task1, table_task2, external_table_task]
        )

        dags.to_airflow_dags(tmp_path)

        # we need to use templates since the temporary dataset name changes
        # between runs
        env = Environment(loader=PackageLoader("tests", "data/dags"))
        with_dependencies_template = env.get_template("test_dag_with_dependencies")
        external_dependency_template = env.get_template(
            "test_dag_external_dependency"
        )

        template_args = {"temporary_dataset": temporary_dataset}
        expected_dag_with_dependencies = with_dependencies_template.render(
            template_args
        )
        expected_dag_external_dependency = external_dependency_template.render(
            template_args
        )

        dag_with_dependencies = (tmp_path / "bqetl_test_dag.py").read_text().strip()
        dag_external_dependency = (
            (tmp_path / "bqetl_external_test_dag.py").read_text().strip()
        )

        assert dag_with_dependencies == expected_dag_with_dependencies
        assert dag_external_dependency == expected_dag_external_dependency
Ejemplo n.º 26
0
    def test_dags_from_empty_file(self, tmp_path):
        """An empty ``dags.yaml`` produces a collection without any DAGs."""
        empty_config = tmp_path / "dags.yaml"
        empty_config.write_text("")

        loaded = DagCollection.from_file(empty_config)
        dag_count = len(loaded.dags)

        assert dag_count == 0