Example #1
    def test_bigquery_operator_extra_link_when_single_query(self, mock_hook, session):
        bigquery_task = BigQueryExecuteQueryOperator(
            task_id=TASK_ID,
            sql='SELECT * FROM test_table',
            dag=self.dag,
        )
        self.dag.clear()
        session.query(XCom).delete()

        ti = TaskInstance(
            task=bigquery_task,
            execution_date=DEFAULT_DATE,
        )

        job_id = '12345'
        ti.xcom_push(key='job_id', value=job_id)

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j={job_id}'.format(job_id=job_id),
            bigquery_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name),
        )

        self.assertEqual(
            '',
            bigquery_task.get_extra_links(datetime(2019, 1, 1), BigQueryConsoleLink.name),
        )
Example #2
    def test_execute_bad_type(self, mock_hook):
        operator = BigQueryExecuteQueryOperator(
            task_id=TASK_ID,
            sql=1,
            destination_dataset_table=None,
            write_disposition='WRITE_EMPTY',
            allow_large_results=False,
            flatten_results=None,
            gcp_conn_id='google_cloud_default',
            udf_config=None,
            use_legacy_sql=True,
            maximum_billing_tier=None,
            maximum_bytes_billed=None,
            create_disposition='CREATE_IF_NEEDED',
            schema_update_options=(),
            query_params=None,
            labels=None,
            priority='INTERACTIVE',
            time_partitioning=None,
            api_resource_configs=None,
            cluster_fields=None,
        )

        with self.assertRaises(AirflowException):
            operator.execute(MagicMock())
Example #3
    def test_bigquery_operator_defaults(self, mock_hook):
        operator = BigQueryExecuteQueryOperator(
            task_id=TASK_ID,
            sql='Select * from test_table',
            dag=self.dag,
            default_args=self.args,
            schema_update_options=None
        )

        operator.execute(MagicMock())
        mock_hook.return_value \
            .run_query \
            .assert_called_once_with(
                sql='Select * from test_table',
                destination_dataset_table=None,
                write_disposition='WRITE_EMPTY',
                allow_large_results=False,
                flatten_results=None,
                udf_config=None,
                maximum_billing_tier=None,
                maximum_bytes_billed=None,
                create_disposition='CREATE_IF_NEEDED',
                schema_update_options=None,
                query_params=None,
                labels=None,
                priority='INTERACTIVE',
                time_partitioning=None,
                api_resource_configs=None,
                cluster_fields=None,
                encryption_configuration=None
            )
        self.assertTrue(isinstance(operator.sql, str))
        ti = TaskInstance(task=operator, execution_date=DEFAULT_DATE)
        ti.render_templates()
        self.assertTrue(isinstance(ti.task.sql, str))
Example #4
    def test_bigquery_operator_extra_link_when_multiple_query(self, mock_hook, session):
        bigquery_task = BigQueryExecuteQueryOperator(
            task_id=TASK_ID,
            sql=['SELECT * FROM test_table', 'SELECT * FROM test_table2'],
            dag=self.dag,
        )
        self.dag.clear()
        session.query(XCom).delete()

        ti = TaskInstance(
            task=bigquery_task,
            execution_date=DEFAULT_DATE,
        )

        job_id = ['123', '45']
        ti.xcom_push(key='job_id', value=job_id)

        self.assertEqual(
            {'BigQuery Console #1', 'BigQuery Console #2'},
            bigquery_task.operator_extra_link_dict.keys()
        )

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=123',
            bigquery_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #1'),
        )

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=45',
            bigquery_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #2'),
        )
Example #5
def _create_dag():
    with DAG(dag_id="TEST_DAG_ID",
             default_args=dict(start_date=days_ago(2))) as dag:
        BigQueryExecuteQueryOperator(task_id="TEST_SINGLE_QUERY",
                                     sql="SELECT 1")
        BigQueryExecuteQueryOperator(task_id="TEST_MULTIPLE_QUERY",
                                     sql=["SELECT 1", "SELECT 2"])
    return dag
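A minimal, hypothetical usage sketch for the helper above, reusing the SerializedDAG round-trip shown in Examples #8 and #9 (the function name and assertions are illustrative, not from the original test file):

from airflow.serialization.serialized_objects import SerializedDAG

def _check_serialized_extra_links():
    # Build the test DAG, serialize it to a dict, then restore it.
    dag = _create_dag()
    restored = SerializedDAG.from_dict(SerializedDAG.to_dict(dag))
    # The multi-query task exposes one console link per SQL statement (see Example #4).
    multiple = restored.task_dict["TEST_MULTIPLE_QUERY"]
    assert {'BigQuery Console #1', 'BigQuery Console #2'} == multiple.operator_extra_link_dict.keys()
    # The single-query task keeps its single console link.
    single = restored.task_dict["TEST_SINGLE_QUERY"]
    assert len(list(single.operator_extra_links)) == 1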
Example #6
    def test_bigquery_operator_extra_link_when_missing_job_id(self, mock_hook, session):
        bigquery_task = BigQueryExecuteQueryOperator(
            task_id=TASK_ID,
            sql='SELECT * FROM test_table',
            dag=self.dag,
        )
        self.dag.clear()
        session.query(XCom).delete()

        assert '' == bigquery_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name)
Example #7
    def test_execute(self, mock_hook):
        encryption_configuration = {'key': 'kk'}

        operator = BigQueryExecuteQueryOperator(
            task_id=TASK_ID,
            sql='Select * from test_table',
            destination_dataset_table=None,
            write_disposition='WRITE_EMPTY',
            allow_large_results=False,
            flatten_results=None,
            gcp_conn_id='google_cloud_default',
            udf_config=None,
            use_legacy_sql=True,
            maximum_billing_tier=None,
            maximum_bytes_billed=None,
            create_disposition='CREATE_IF_NEEDED',
            schema_update_options=(),
            query_params=None,
            labels=None,
            priority='INTERACTIVE',
            time_partitioning=None,
            api_resource_configs=None,
            cluster_fields=None,
            encryption_configuration=encryption_configuration
        )

        operator.execute(MagicMock())
        mock_hook.return_value \
            .run_query \
            .assert_called_once_with(
                sql='Select * from test_table',
                destination_dataset_table=None,
                write_disposition='WRITE_EMPTY',
                allow_large_results=False,
                flatten_results=None,
                udf_config=None,
                maximum_billing_tier=None,
                maximum_bytes_billed=None,
                create_disposition='CREATE_IF_NEEDED',
                schema_update_options=(),
                query_params=None,
                labels=None,
                priority='INTERACTIVE',
                time_partitioning=None,
                api_resource_configs=None,
                cluster_fields=None,
                encryption_configuration=encryption_configuration
            )
Example #8
    def test_bigquery_operator_extra_serialized_field_when_multiple_queries(
            self):
        with self.dag:
            BigQueryExecuteQueryOperator(
                task_id=TASK_ID,
                sql=['SELECT * FROM test_table', 'SELECT * FROM test_table2'],
            )
        serialized_dag = SerializedDAG.to_dict(self.dag)
        self.assertIn("sql", serialized_dag["dag"]["tasks"][0])

        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict[TASK_ID]
        self.assertEqual(
            getattr(simple_task, "sql"),
            ['SELECT * FROM test_table', 'SELECT * FROM test_table2'])

        #########################################################
        # Verify Operator Links work with Serialized Operator
        #########################################################

        # Check Serialized version of operator link
        self.assertEqual(
            serialized_dag["dag"]["tasks"][0]["_operator_extra_links"],
            [
                {
                    'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink':
                    {
                        'index': 0
                    }
                },
                {
                    'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink':
                    {
                        'index': 1
                    }
                },
            ],
        )

        # Check DeSerialized version of operator link
        self.assertIsInstance(
            list(simple_task.operator_extra_links)[0],
            BigQueryConsoleIndexableLink)

        ti = TaskInstance(task=simple_task, execution_date=DEFAULT_DATE)
        job_id = ['123', '45']
        ti.xcom_push(key='job_id', value=job_id)

        self.assertEqual({'BigQuery Console #1', 'BigQuery Console #2'},
                         simple_task.operator_extra_link_dict.keys())

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=123',
            simple_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #1'),
        )

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=45',
            simple_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #2'),
        )
Example #9
    def test_bigquery_operator_extra_serialized_field_when_single_query(self):
        with self.dag:
            BigQueryExecuteQueryOperator(
                task_id=TASK_ID,
                sql='SELECT * FROM test_table',
            )
        serialized_dag = SerializedDAG.to_dict(self.dag)
        assert "sql" in serialized_dag["dag"]["tasks"][0]

        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict[TASK_ID]
        assert getattr(simple_task, "sql") == 'SELECT * FROM test_table'

        #########################################################
        # Verify Operator Links work with Serialized Operator
        #########################################################

        # Check Serialized version of operator link
        assert serialized_dag["dag"]["tasks"][0]["_operator_extra_links"] == [
            {'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleLink': {}}
        ]

        # Check DeSerialized version of operator link
        assert isinstance(list(simple_task.operator_extra_links)[0], BigQueryConsoleLink)

        ti = TaskInstance(task=simple_task, execution_date=DEFAULT_DATE)
        ti.xcom_push('job_id', 12345)

        # check for positive case
        url = simple_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name)
        assert url == 'https://console.cloud.google.com/bigquery?j=12345'

        # check for negative case
        url2 = simple_task.get_extra_links(datetime(2017, 1, 2), BigQueryConsoleLink.name)
        assert url2 == ''
Example #10
def bq_operator(table_object, **kwargs):
    default_query = f"create or replace table {table_object['table_name']}  as select * from {table_object['view_name']}"
    return BigQueryExecuteQueryOperator(
        task_id=f"load_{table_object['table_name']}",
        sql=default_query,
        use_legacy_sql=False,
        dag=dag)
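A hypothetical call sketch for bq_operator: the table_object keys come from the function body, while the names and the module-level dag it references are placeholder assumptions:

ratings_table = {
    # bq_operator only reads these two keys
    "table_name": "analytics.daily_ratings",   # placeholder destination table
    "view_name": "analytics.v_daily_ratings",  # placeholder source view
}
load_daily_ratings = bq_operator(ratings_table)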
def get_bq_operator_as_insert_data_with_query_params(
        dag: DAG,
        task_id: str,
        sql: str,
        destination_dataset_table: str,
        write_disposition: str = "WRITE_TRUNCATE",
        location: str = "asia-northeast1",
        gcp_conn_id: str = 'google_cloud_default',
        use_legacy_sql: bool = False,
        allow_large_results: bool = False,
        params: dict = None,
        on_failure_callback=None,
        on_success_callback=None,
        query_params: list = None) -> BigQueryExecuteQueryOperator:
    """Query_Parameter付きでインサート用のBigQueryExecuteQueryOperatorをコール

    Args:
        dag (str): [description]
        task_id (str): [description]
        sql (str): [description]
        destination_dataset_table (str): [description]
        write_disposition (str, optional): [description]. Defaults to "WRITE_TRUNCATE".
        location (str, optional): [description]. Defaults to "asia-northeast1".
        gcp_conn_id (str, optional): [description]. Defaults to 'google_cloud_default'.
        use_legacy_sql (bool, optional): [description]. Defaults to False.
        allow_large_results (bool, optional): [description]. Defaults to False.
        is_notification_failure (bool, optional): [description]. Defaults to False.
        is_notification_success (bool, optional): [description]. Defaults to False.
        query_params (list, optional): [description]. Defaults to None.

    Returns:
        BigQueryExecuteQueryOperator: [description]
    """

    return BigQueryExecuteQueryOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        use_legacy_sql=use_legacy_sql,
        allow_large_results=allow_large_results,
        write_disposition=write_disposition,
        sql=sql,
        query_params=query_params,
        destination_dataset_table=destination_dataset_table,
        dag=dag,
        params=params,
        location=location,
        on_failure_callback=on_failure_callback,
        on_success_callback=on_success_callback)
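A hedged usage sketch for the helper above: the dataset, table, and parameter values are placeholders, and the query_params structure follows the BigQuery named-parameter format also used in Example #17:

insert_recent_orders = get_bq_operator_as_insert_data_with_query_params(
    dag=dag,
    task_id="insert_recent_orders",
    sql="SELECT * FROM `my_project.staging.orders` WHERE order_date >= @min_date",
    destination_dataset_table="my_project.warehouse.orders",
    query_params=[{
        "name": "min_date",
        "parameterType": {"type": "DATE"},
        "parameterValue": {"value": "2021-01-01"},
    }],
)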
def get_bq_operator_with_query_params(
        dag: DAG,
        task_id: str,
        sql: str,
        location: str = "asia-northeast1",
        gcp_conn_id: str = 'google_cloud_default',
        use_legacy_sql: bool = False,
        allow_large_results: bool = False,
        on_failure_callback=None,
        on_success_callback=None,
        query_params: list = None) -> BigQueryExecuteQueryOperator:
    """Query_Parameter付きでSQL実行用のBigQueryExecuteQueryOperatorをコール

  Args:
      task_id ([type], optional): [description]. Defaults to task_id.
      gcp_conn_id ([type], optional): [description]. Defaults to gcp_conn_id.
      use_legacy_sql ([type], optional): [description]. Defaults to use_legacy_sql.
      allow_large_results ([type], optional): [description]. Defaults to allow_large_results.
      sql ([type], optional): [description]. Defaults to sql.
      query_params ([type], optional): [description]. Defaults to query_params.
      dag ([type], optional): [description]. Defaults to dag.
      location ([type], optional): [description]. Defaults to location.
      on_failure_callback ([type], optional): [description]. Defaults to on_failure_callback.
      is_notification_success ([type], optional): [description]. Defaults to is_notification_success.

  Returns:
      BigQueryExecuteQueryOperator: [description]
  """
    return BigQueryExecuteQueryOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        use_legacy_sql=use_legacy_sql,
        allow_large_results=allow_large_results,
        sql=sql,
        query_params=query_params,
        dag=dag,
        location=location,
        on_failure_callback=on_failure_callback,
        on_success_callback=on_success_callback)
Example #13
from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryExecuteQueryOperator
import pendulum

with DAG(
        'bq',
        default_args={'retries': 2},
        description='ETL DAG tutorial',
        schedule_interval=None,
        start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
        catchup=False,
        tags=['example'],
) as dag:
    bq_op_1 = BigQueryExecuteQueryOperator(
        task_id='bq_op_1',
        gcp_conn_id='google_cloud_default',
        sql='select 1 as id',
        destination_dataset_table='lake.test',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        use_legacy_sql=False)
Example #14
    # assumed: an SSHOperator task running the dbt project (variable name hypothetical)
    dbt_ad_reporting = SSHOperator(
        task_id='dbt_ad_reporting',
        command='cd dbt_ad_reporting ; ~/.local/bin/dbt run -m +ad_reporting',
        ssh_conn_id='dbtvm'
    )


    ml_branch = BranchPythonOperator(
        task_id='ml_branch',
        python_callable=ml_branch,
        provide_context=True
    )


    train_model = BigQueryExecuteQueryOperator(
        task_id="train_model",
        sql=TRAINING_QUERY,
        use_legacy_sql=False
    )


    get_preds = BigQueryExecuteQueryOperator(
        task_id="get_predictions",
        sql=SERVING_QUERY,
        use_legacy_sql=False,
        destination_dataset_table=DATASET_NAME + "." + DESTINATION_TABLE,
        write_disposition="WRITE_APPEND"
    )


    print_preds = BigQueryGetDataOperator(
        task_id="print_predictions",
Example #15
                                                   delete_contents=True)

    # [START howto_operator_create_external_table_multiple_types]
    create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
        task_id="create_external_table",
        bucket=BUCKET_NAME,
        source_objects=[
            f"{EXPORT_PREFIX}/all_namespaces/kind_{EXPORT_COLLECTION_ID}"
            f"/all_namespaces_kind_{EXPORT_COLLECTION_ID}.export_metadata"
        ],
        source_format="DATASTORE_BACKUP",
        destination_project_dataset_table=
        f"{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data",
    )
    # [END howto_operator_create_external_table_multiple_types]

    read_data_from_gcs_multiple_types = BigQueryExecuteQueryOperator(
        task_id="execute_query",
        sql=
        f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data`",
        use_legacy_sql=False,
    )

    # Firestore
    export_database_to_gcs >> create_dataset

    # BigQuery
    create_dataset >> create_external_table_multiple_types
    create_external_table_multiple_types >> read_data_from_gcs_multiple_types
    read_data_from_gcs_multiple_types >> delete_dataset
Example #16
    # [START howto_operator_create_external_table_multiple_types]
    create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
        task_id="create_external_table_multiple_types",
        bucket=GCS_BUCKET,
        source_objects=[f"{safe_name(SOURCE_MULTIPLE_TYPES)}.*.json"],
        source_format="NEWLINE_DELIMITED_JSON",
        destination_project_dataset_table=
        f"{DATASET_NAME}.{safe_name(SOURCE_MULTIPLE_TYPES)}",
        schema_object=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json",
    )
    # [END howto_operator_create_external_table_multiple_types]

    read_data_from_gcs_multiple_types = BigQueryExecuteQueryOperator(
        task_id="read_data_from_gcs_multiple_types",
        sql=
        f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{safe_name(SOURCE_MULTIPLE_TYPES)}`",
        use_legacy_sql=False,
    )

    # [START howto_operator_presto_to_gcs_many_chunks]
    presto_to_gcs_many_chunks = PrestoToGCSOperator(
        task_id="presto_to_gcs_many_chunks",
        sql=f"select * from {SOURCE_CUSTOMER_TABLE}",
        bucket=GCS_BUCKET,
        filename=f"{safe_name(SOURCE_CUSTOMER_TABLE)}.{{}}.json",
        schema_filename=f"{safe_name(SOURCE_CUSTOMER_TABLE)}-schema.json",
        approx_max_file_size_bytes=10_000_000,
        gzip=False,
    )
    # [END howto_operator_presto_to_gcs_many_chunks]
Example #17
                                         "test-bigquery-sample-data")

with models.DAG(
        "example_bigquery",
        default_args=default_args,
        schedule_interval=None,  # Override to match your needs
        tags=['example'],
) as dag:

    execute_query = BigQueryExecuteQueryOperator(
        task_id="execute_query",
        sql=MOST_VALUABLE_INCOMING_TRANSACTIONS,
        use_legacy_sql=False,
        query_params=[{
            "name": "to_address",
            "parameterType": {
                "type": "STRING"
            },
            "parameterValue": {
                "value": WALLET_ADDRESS
            },
        }],
    )

    bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
        task_id="execute_multi_query",
        sql=[MOST_VALUABLE_INCOMING_TRANSACTIONS, MOST_ACTIVE_PLAYERS],
        use_legacy_sql=False,
        query_params=[{
            "name": "to_address",
            "parameterType": {
                "type": "STRING"
            },
            "parameterValue": {
                "value": WALLET_ADDRESS
            },
        }],
    )
Example #18
        # [START howto_operator_bigquery_select_job]
        select_query_job = BigQueryInsertJobOperator(
            task_id="select_query_job",
            configuration={
                "query": {
                    "query": "{% include 'example_bigquery_query.sql' %}",
                    "useLegacySql": False,
                }
            },
            location=location,
        )
        # [END howto_operator_bigquery_select_job]

        execute_insert_query = BigQueryExecuteQueryOperator(
            task_id="execute_insert_query", sql=INSERT_ROWS_QUERY, use_legacy_sql=False, location=location
        )

        bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
            task_id="execute_multi_query",
            sql=[
                f"SELECT * FROM {DATASET_NAME}.{TABLE_2}",
                f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_2}",
            ],
            use_legacy_sql=False,
            location=location,
        )

        execute_query_save = BigQueryExecuteQueryOperator(
            task_id="execute_query_save",
            sql=f"SELECT * FROM {DATASET_NAME}.{TABLE_1}",
Example #19
        schema_fields=SCHEMA,
        time_partitioning={
            "type": "DAY",
            "field": "ds",
        },
    )
    # [START howto_sensor_bigquery_table]
    check_table_exists = BigQueryTableExistenceSensor(
        task_id="check_table_exists",
        project_id=PROJECT_ID,
        dataset_id=DATASET_NAME,
        table_id=TABLE_NAME)
    # [END howto_sensor_bigquery_table]

    execute_insert_query = BigQueryExecuteQueryOperator(
        task_id="execute_insert_query",
        sql=INSERT_ROWS_QUERY,
        use_legacy_sql=False)

    # [START howto_sensor_bigquery_table_partition]
    check_table_partition_exists = BigQueryTablePartitionExistenceSensor(
        task_id="check_table_partition_exists",
        project_id=PROJECT_ID,
        dataset_id=DATASET_NAME,
        table_id=TABLE_NAME,
        partition_id=PARTITION_NAME,
    )
    # [END howto_sensor_bigquery_table_partition]

    delete_dataset = BigQueryDeleteDatasetOperator(task_id="delete_dataset",
                                                   dataset_id=DATASET_NAME,
                                                   delete_contents=True)
Example #20
    run_operator = FacebookAdsReportToGcsOperator(
        task_id='run_fetch_data',
        start_date=days_ago(2),
        owner='airflow',
        bucket_name=GCS_BUCKET,
        params=PARAMS,
        fields=FIELDS,
        gcp_conn_id=GCS_CONN_ID,
        object_name=GCS_OBJ_PATH,
    )
    # [END howto_FB_ADS_to_gcs_operator]

    # [START howto_operator_gcs_to_bq]
    load_csv = GCSToBigQueryOperator(
        task_id='gcs_to_bq_example',
        bucket=GCS_BUCKET,
        source_objects=[GCS_OBJ_PATH],
        destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
        write_disposition='WRITE_TRUNCATE')
    # [END howto_operator_gcs_to_bq]

    # [START howto_operator_read_data_from_gcs]
    read_data_from_gcs_many_chunks = BigQueryExecuteQueryOperator(
        task_id="read_data_from_gcs_many_chunks",
        sql=f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}`",
        use_legacy_sql=False,
    )
    # [END howto_operator_read_data_from_gcs]

    create_dataset >> create_table >> run_operator >> load_csv >> read_data_from_gcs_many_chunks
Example #21
        },
    ],
    destination_project_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                                       os.environ["BIGQUERY_DATASET"] + "." +
                                       "ratings${{ ds_nodash }}"),
    dag=dag,
)

query_top_ratings = BigQueryExecuteQueryOperator(
    task_id="query_top_ratings",
    destination_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                               os.environ["BIGQUERY_DATASET"] + "." +
                               "rating_results_{{ ds_nodash }}"),
    sql=("SELECT movieid, AVG(rating) as avg_rating, COUNT(*) as num_ratings "
         "FROM " + os.environ["BIGQUERY_DATASET"] + ".ratings "
         "WHERE DATE(timestamp) <= DATE({{ ds }}) "
         "GROUP BY movieid "
         "ORDER BY avg_rating DESC"),
    write_disposition="WRITE_TRUNCATE",
    create_disposition="CREATE_IF_NEEDED",
    bigquery_conn_id="gcp",
    dag=dag,
)

extract_top_ratings = BigQueryToGCSOperator(
    task_id="extract_top_ratings",
    source_project_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                                  os.environ["BIGQUERY_DATASET"] + "." +
                                  "rating_results_{{ ds_nodash }}"),
    destination_cloud_storage_uris=[
        "gs://" + os.environ["RESULT_BUCKET"] + "/{{ ds_nodash }}.csv"