def test_bigquery_operator_extra_link_when_single_query(self, mock_hook, session):
    bigquery_task = BigQueryExecuteQueryOperator(
        task_id=TASK_ID,
        sql='SELECT * FROM test_table',
        dag=self.dag,
    )
    self.dag.clear()
    session.query(XCom).delete()

    ti = TaskInstance(
        task=bigquery_task,
        execution_date=DEFAULT_DATE,
    )

    job_id = '12345'
    ti.xcom_push(key='job_id', value=job_id)

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j={job_id}'.format(job_id=job_id),
        bigquery_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name),
    )

    self.assertEqual(
        '',
        bigquery_task.get_extra_links(datetime(2019, 1, 1), BigQueryConsoleLink.name),
    )
def test_execute_bad_type(self, mock_hook):
    operator = BigQueryExecuteQueryOperator(
        task_id=TASK_ID,
        sql=1,
        destination_dataset_table=None,
        write_disposition='WRITE_EMPTY',
        allow_large_results=False,
        flatten_results=None,
        gcp_conn_id='google_cloud_default',
        udf_config=None,
        use_legacy_sql=True,
        maximum_billing_tier=None,
        maximum_bytes_billed=None,
        create_disposition='CREATE_IF_NEEDED',
        schema_update_options=(),
        query_params=None,
        labels=None,
        priority='INTERACTIVE',
        time_partitioning=None,
        api_resource_configs=None,
        cluster_fields=None,
    )

    with self.assertRaises(AirflowException):
        operator.execute(MagicMock())
def test_bigquery_operator_defaults(self, mock_hook):
    operator = BigQueryExecuteQueryOperator(
        task_id=TASK_ID,
        sql='Select * from test_table',
        dag=self.dag,
        default_args=self.args,
        schema_update_options=None,
    )

    operator.execute(MagicMock())
    mock_hook.return_value.run_query.assert_called_once_with(
        sql='Select * from test_table',
        destination_dataset_table=None,
        write_disposition='WRITE_EMPTY',
        allow_large_results=False,
        flatten_results=None,
        udf_config=None,
        maximum_billing_tier=None,
        maximum_bytes_billed=None,
        create_disposition='CREATE_IF_NEEDED',
        schema_update_options=None,
        query_params=None,
        labels=None,
        priority='INTERACTIVE',
        time_partitioning=None,
        api_resource_configs=None,
        cluster_fields=None,
        encryption_configuration=None,
    )

    self.assertTrue(isinstance(operator.sql, str))

    ti = TaskInstance(task=operator, execution_date=DEFAULT_DATE)
    ti.render_templates()
    self.assertTrue(isinstance(ti.task.sql, str))
def test_bigquery_operator_extra_link_when_multiple_query(self, mock_hook, session):
    bigquery_task = BigQueryExecuteQueryOperator(
        task_id=TASK_ID,
        sql=['SELECT * FROM test_table', 'SELECT * FROM test_table2'],
        dag=self.dag,
    )
    self.dag.clear()
    session.query(XCom).delete()

    ti = TaskInstance(
        task=bigquery_task,
        execution_date=DEFAULT_DATE,
    )

    job_id = ['123', '45']
    ti.xcom_push(key='job_id', value=job_id)

    self.assertEqual(
        {'BigQuery Console #1', 'BigQuery Console #2'},
        bigquery_task.operator_extra_link_dict.keys(),
    )

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=123',
        bigquery_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #1'),
    )

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=45',
        bigquery_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #2'),
    )
def _create_dag():
    with DAG(dag_id="TEST_DAG_ID", default_args=dict(start_date=days_ago(2))) as dag:
        BigQueryExecuteQueryOperator(task_id="TEST_SINGLE_QUERY", sql="SELECT 1")
        BigQueryExecuteQueryOperator(task_id="TEST_MULTIPLE_QUERY", sql=["SELECT 1", "SELECT 2"])
    return dag
def test_bigquery_operator_extra_link_when_missing_job_id(self, mock_hook, session):
    bigquery_task = BigQueryExecuteQueryOperator(
        task_id=TASK_ID,
        sql='SELECT * FROM test_table',
        dag=self.dag,
    )
    self.dag.clear()
    session.query(XCom).delete()

    assert '' == bigquery_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name)
def test_execute(self, mock_hook):
    encryption_configuration = {'key': 'kk'}
    operator = BigQueryExecuteQueryOperator(
        task_id=TASK_ID,
        sql='Select * from test_table',
        destination_dataset_table=None,
        write_disposition='WRITE_EMPTY',
        allow_large_results=False,
        flatten_results=None,
        gcp_conn_id='google_cloud_default',
        udf_config=None,
        use_legacy_sql=True,
        maximum_billing_tier=None,
        maximum_bytes_billed=None,
        create_disposition='CREATE_IF_NEEDED',
        schema_update_options=(),
        query_params=None,
        labels=None,
        priority='INTERACTIVE',
        time_partitioning=None,
        api_resource_configs=None,
        cluster_fields=None,
        encryption_configuration=encryption_configuration,
    )

    operator.execute(MagicMock())
    mock_hook.return_value.run_query.assert_called_once_with(
        sql='Select * from test_table',
        destination_dataset_table=None,
        write_disposition='WRITE_EMPTY',
        allow_large_results=False,
        flatten_results=None,
        udf_config=None,
        maximum_billing_tier=None,
        maximum_bytes_billed=None,
        create_disposition='CREATE_IF_NEEDED',
        schema_update_options=(),
        query_params=None,
        labels=None,
        priority='INTERACTIVE',
        time_partitioning=None,
        api_resource_configs=None,
        cluster_fields=None,
        encryption_configuration=encryption_configuration,
    )
def test_bigquery_operator_extra_serialized_field_when_multiple_queries(self):
    with self.dag:
        BigQueryExecuteQueryOperator(
            task_id=TASK_ID,
            sql=['SELECT * FROM test_table', 'SELECT * FROM test_table2'],
        )
    serialized_dag = SerializedDAG.to_dict(self.dag)
    self.assertIn("sql", serialized_dag["dag"]["tasks"][0])

    dag = SerializedDAG.from_dict(serialized_dag)
    simple_task = dag.task_dict[TASK_ID]
    self.assertEqual(
        getattr(simple_task, "sql"),
        ['SELECT * FROM test_table', 'SELECT * FROM test_table2'],
    )

    #########################################################
    # Verify Operator Links work with Serialized Operator
    #########################################################

    # Check Serialized version of operator link
    self.assertEqual(
        serialized_dag["dag"]["tasks"][0]["_operator_extra_links"],
        [
            {
                'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink': {
                    'index': 0
                }
            },
            {
                'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink': {
                    'index': 1
                }
            },
        ],
    )

    # Check DeSerialized version of operator link
    self.assertIsInstance(
        list(simple_task.operator_extra_links)[0], BigQueryConsoleIndexableLink)

    ti = TaskInstance(task=simple_task, execution_date=DEFAULT_DATE)
    job_id = ['123', '45']
    ti.xcom_push(key='job_id', value=job_id)

    self.assertEqual(
        {'BigQuery Console #1', 'BigQuery Console #2'},
        simple_task.operator_extra_link_dict.keys(),
    )

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=123',
        simple_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #1'),
    )

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=45',
        simple_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #2'),
    )
def test_bigquery_operator_extra_serialized_field_when_single_query(self):
    with self.dag:
        BigQueryExecuteQueryOperator(
            task_id=TASK_ID,
            sql='SELECT * FROM test_table',
        )
    serialized_dag = SerializedDAG.to_dict(self.dag)
    assert "sql" in serialized_dag["dag"]["tasks"][0]

    dag = SerializedDAG.from_dict(serialized_dag)
    simple_task = dag.task_dict[TASK_ID]
    assert getattr(simple_task, "sql") == 'SELECT * FROM test_table'

    #########################################################
    # Verify Operator Links work with Serialized Operator
    #########################################################

    # Check Serialized version of operator link
    assert serialized_dag["dag"]["tasks"][0]["_operator_extra_links"] == [
        {'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleLink': {}}
    ]

    # Check DeSerialized version of operator link
    assert isinstance(list(simple_task.operator_extra_links)[0], BigQueryConsoleLink)

    ti = TaskInstance(task=simple_task, execution_date=DEFAULT_DATE)
    ti.xcom_push('job_id', 12345)

    # check for positive case
    url = simple_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name)
    assert url == 'https://console.cloud.google.com/bigquery?j=12345'

    # check for negative case
    url2 = simple_task.get_extra_links(datetime(2017, 1, 2), BigQueryConsoleLink.name)
    assert url2 == ''
def bq_operator(table_object, **kwargs):
    default_query = (
        f"create or replace table {table_object['table_name']} "
        f"as select * from {table_object['view_name']}"
    )
    return BigQueryExecuteQueryOperator(
        task_id=f"load_{table_object['table_name']}",
        sql=default_query,
        use_legacy_sql=False,
        dag=dag,
    )
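# A minimal usage sketch for the factory above. The table/view names are
# hypothetical, and the surrounding `dag` is assumed to be defined at module
# scope, as the factory body requires.
load_orders = bq_operator({
    'table_name': 'analytics.orders',    # destination table to (re)create
    'view_name': 'staging.orders_view',  # source view to materialize
})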
def get_bq_operator_as_insert_data_with_query_params(
        dag: DAG,
        task_id: str,
        sql: str,
        destination_dataset_table: str,
        write_disposition: str = "WRITE_TRUNCATE",
        location: str = "asia-northeast1",
        gcp_conn_id: str = 'google_cloud_default',
        use_legacy_sql: bool = False,
        allow_large_results: bool = False,
        params: dict = None,
        on_failure_callback=None,
        on_success_callback=None,
        query_params: list = None) -> BigQueryExecuteQueryOperator:
    """Build a BigQueryExecuteQueryOperator that inserts data using query parameters.

    Args:
        dag (DAG): [description]
        task_id (str): [description]
        sql (str): [description]
        destination_dataset_table (str): [description]
        write_disposition (str, optional): [description]. Defaults to "WRITE_TRUNCATE".
        location (str, optional): [description]. Defaults to "asia-northeast1".
        gcp_conn_id (str, optional): [description]. Defaults to 'google_cloud_default'.
        use_legacy_sql (bool, optional): [description]. Defaults to False.
        allow_large_results (bool, optional): [description]. Defaults to False.
        params (dict, optional): [description]. Defaults to None.
        on_failure_callback (optional): [description]. Defaults to None.
        on_success_callback (optional): [description]. Defaults to None.
        query_params (list, optional): [description]. Defaults to None.

    Returns:
        BigQueryExecuteQueryOperator: [description]
    """
    return BigQueryExecuteQueryOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        use_legacy_sql=use_legacy_sql,
        allow_large_results=allow_large_results,
        write_disposition=write_disposition,
        sql=sql,
        query_params=query_params,
        destination_dataset_table=destination_dataset_table,
        dag=dag,
        params=params,
        location=location,
        on_failure_callback=on_failure_callback,
        on_success_callback=on_success_callback,
    )
def get_bq_operator_with_query_params(
        dag: DAG,
        task_id: str,
        sql: str,
        location: str = "asia-northeast1",
        gcp_conn_id: str = 'google_cloud_default',
        use_legacy_sql: bool = False,
        allow_large_results: bool = False,
        on_failure_callback=None,
        on_success_callback=None,
        query_params: list = None) -> BigQueryExecuteQueryOperator:
    """Build a BigQueryExecuteQueryOperator that runs SQL with query parameters.

    Args:
        dag (DAG): [description]
        task_id (str): [description]
        sql (str): [description]
        location (str, optional): [description]. Defaults to "asia-northeast1".
        gcp_conn_id (str, optional): [description]. Defaults to 'google_cloud_default'.
        use_legacy_sql (bool, optional): [description]. Defaults to False.
        allow_large_results (bool, optional): [description]. Defaults to False.
        on_failure_callback (optional): [description]. Defaults to None.
        on_success_callback (optional): [description]. Defaults to None.
        query_params (list, optional): [description]. Defaults to None.

    Returns:
        BigQueryExecuteQueryOperator: [description]
    """
    return BigQueryExecuteQueryOperator(
        task_id=task_id,
        gcp_conn_id=gcp_conn_id,
        use_legacy_sql=use_legacy_sql,
        allow_large_results=allow_large_results,
        sql=sql,
        query_params=query_params,
        dag=dag,
        location=location,
        on_failure_callback=on_failure_callback,
        on_success_callback=on_success_callback,
    )
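# A minimal usage sketch for the helper above; the dag, task name, SQL, and
# parameter values are hypothetical. query_params uses BigQuery's named
# query-parameter structure, the same shape as in the example DAGs further
# below.
filter_users = get_bq_operator_with_query_params(
    dag=dag,
    task_id='filter_users',
    sql='SELECT * FROM dataset.users WHERE country = @country',
    query_params=[{
        "name": "country",
        "parameterType": {"type": "STRING"},
        "parameterValue": {"value": "JP"},
    }],
)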
from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryExecuteQueryOperator
import pendulum

with DAG(
    'bq',
    default_args={'retries': 2},
    description='ETL DAG tutorial',
    schedule_interval=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=['example'],
) as dag:
    bq_op_1 = BigQueryExecuteQueryOperator(
        task_id='bq_op_1',
        gcp_conn_id='google_cloud_default',
        sql='select 1 as id',
        destination_dataset_table='lake.test',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        use_legacy_sql=False,
    )
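# A hypothetical quick check for the DAG above, assuming Airflow 2.5+ where
# DAG.test() exists; on earlier versions, `airflow dags test bq <date>` from
# the CLI serves the same purpose.
if __name__ == "__main__":
    dag.test()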
    task_id='dbt_ad_reporting',
    command='cd dbt_ad_reporting ; ~/.local/bin/dbt run -m +ad_reporting',
    ssh_conn_id='dbtvm'
)

ml_branch = BranchPythonOperator(
    task_id='ml_branch',
    python_callable=ml_branch,
    provide_context=True
)

train_model = BigQueryExecuteQueryOperator(
    task_id="train_model",
    sql=TRAINING_QUERY,
    use_legacy_sql=False
)

get_preds = BigQueryExecuteQueryOperator(
    task_id="get_predictions",
    sql=SERVING_QUERY,
    use_legacy_sql=False,
    destination_dataset_table=DATASET_NAME + "." + DESTINATION_TABLE,
    write_disposition="WRITE_APPEND"
)

print_preds = BigQueryGetDataOperator(
    task_id="print_predictions",
    delete_contents=True)

# [START howto_operator_create_external_table_multiple_types]
create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
    task_id="create_external_table",
    bucket=BUCKET_NAME,
    source_objects=[
        f"{EXPORT_PREFIX}/all_namespaces/kind_{EXPORT_COLLECTION_ID}"
        f"/all_namespaces_kind_{EXPORT_COLLECTION_ID}.export_metadata"
    ],
    source_format="DATASTORE_BACKUP",
    destination_project_dataset_table=f"{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data",
)
# [END howto_operator_create_external_table_multiple_types]

read_data_from_gcs_multiple_types = BigQueryExecuteQueryOperator(
    task_id="execute_query",
    sql=f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data`",
    use_legacy_sql=False,
)

# Firestore
export_database_to_gcs >> create_dataset

# BigQuery
create_dataset >> create_external_table_multiple_types
create_external_table_multiple_types >> read_data_from_gcs_multiple_types
read_data_from_gcs_multiple_types >> delete_dataset
# [START howto_operator_create_external_table_multiple_types]
create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
    task_id="create_external_table_multiple_types",
    bucket=GCS_BUCKET,
    source_objects=[f"{safe_name(SOURCE_MULTIPLE_TYPES)}.*.json"],
    source_format="NEWLINE_DELIMITED_JSON",
    destination_project_dataset_table=f"{DATASET_NAME}.{safe_name(SOURCE_MULTIPLE_TYPES)}",
    schema_object=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json",
)
# [END howto_operator_create_external_table_multiple_types]

read_data_from_gcs_multiple_types = BigQueryExecuteQueryOperator(
    task_id="read_data_from_gcs_multiple_types",
    sql=f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{safe_name(SOURCE_MULTIPLE_TYPES)}`",
    use_legacy_sql=False,
)

# [START howto_operator_presto_to_gcs_many_chunks]
presto_to_gcs_many_chunks = PrestoToGCSOperator(
    task_id="presto_to_gcs_many_chunks",
    sql=f"select * from {SOURCE_CUSTOMER_TABLE}",
    bucket=GCS_BUCKET,
    filename=f"{safe_name(SOURCE_CUSTOMER_TABLE)}.{{}}.json",
    schema_filename=f"{safe_name(SOURCE_CUSTOMER_TABLE)}-schema.json",
    approx_max_file_size_bytes=10_000_000,
    gzip=False,
)
# [END howto_operator_presto_to_gcs_many_chunks]
"test-bigquery-sample-data") with models.DAG( "example_bigquery", default_args=default_args, schedule_interval=None, # Override to match your needs tags=['example'], ) as dag: execute_query = BigQueryExecuteQueryOperator( task_id="execute_query", sql=MOST_VALUABLE_INCOMING_TRANSACTIONS, use_legacy_sql=False, query_params=[{ "name": "to_address", "parameterType": { "type": "STRING" }, "parameterValue": { "value": WALLET_ADDRESS }, }], ) bigquery_execute_multi_query = BigQueryExecuteQueryOperator( task_id="execute_multi_query", sql=[MOST_VALUABLE_INCOMING_TRANSACTIONS, MOST_ACTIVE_PLAYERS], use_legacy_sql=False, query_params=[{ "name": "to_address", "parameterType": { "type": "STRING"
# [START howto_operator_bigquery_select_job]
select_query_job = BigQueryInsertJobOperator(
    task_id="select_query_job",
    configuration={
        "query": {
            "query": "{% include 'example_bigquery_query.sql' %}",
            "useLegacySql": False,
        }
    },
    location=location,
)
# [END howto_operator_bigquery_select_job]

execute_insert_query = BigQueryExecuteQueryOperator(
    task_id="execute_insert_query",
    sql=INSERT_ROWS_QUERY,
    use_legacy_sql=False,
    location=location,
)

bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
    task_id="execute_multi_query",
    sql=[
        f"SELECT * FROM {DATASET_NAME}.{TABLE_2}",
        f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_2}",
    ],
    use_legacy_sql=False,
    location=location,
)

execute_query_save = BigQueryExecuteQueryOperator(
    task_id="execute_query_save",
    sql=f"SELECT * FROM {DATASET_NAME}.{TABLE_1}",
    schema_fields=SCHEMA,
    time_partitioning={
        "type": "DAY",
        "field": "ds",
    },
)

# [START howto_sensor_bigquery_table]
check_table_exists = BigQueryTableExistenceSensor(
    task_id="check_table_exists",
    project_id=PROJECT_ID,
    dataset_id=DATASET_NAME,
    table_id=TABLE_NAME,
)
# [END howto_sensor_bigquery_table]

execute_insert_query = BigQueryExecuteQueryOperator(
    task_id="execute_insert_query",
    sql=INSERT_ROWS_QUERY,
    use_legacy_sql=False,
)

# [START howto_sensor_bigquery_table_partition]
check_table_partition_exists = BigQueryTablePartitionExistenceSensor(
    task_id="check_table_partition_exists",
    project_id=PROJECT_ID,
    dataset_id=DATASET_NAME,
    table_id=TABLE_NAME,
    partition_id=PARTITION_NAME,
)
# [END howto_sensor_bigquery_table_partition]

delete_dataset = BigQueryDeleteDatasetOperator(
    task_id="delete_dataset",
    dataset_id=DATASET_NAME,
    delete_contents=True,
)
run_operator = FacebookAdsReportToGcsOperator(
    task_id='run_fetch_data',
    start_date=days_ago(2),
    owner='airflow',
    bucket_name=GCS_BUCKET,
    params=PARAMS,
    fields=FIELDS,
    gcp_conn_id=GCS_CONN_ID,
    object_name=GCS_OBJ_PATH,
)
# [END howto_FB_ADS_to_gcs_operator]

# [START howto_operator_gcs_to_bq]
load_csv = GCSToBigQueryOperator(
    task_id='gcs_to_bq_example',
    bucket=GCS_BUCKET,
    source_objects=[GCS_OBJ_PATH],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    write_disposition='WRITE_TRUNCATE',
)
# [END howto_operator_gcs_to_bq]

# [START howto_operator_read_data_from_gcs]
read_data_from_gcs_many_chunks = BigQueryExecuteQueryOperator(
    task_id="read_data_from_gcs_many_chunks",
    sql=f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}`",
    use_legacy_sql=False,
)
# [END howto_operator_read_data_from_gcs]

create_dataset >> create_table >> run_operator >> load_csv >> read_data_from_gcs_many_chunks
        },
    ],
    destination_project_dataset_table=(
        os.environ["GCP_PROJECT"] + ":" + os.environ["BIGQUERY_DATASET"] + "."
        + "ratings${{ ds_nodash }}"
    ),
    dag=dag,
)

query_top_ratings = BigQueryExecuteQueryOperator(
    task_id="query_top_ratings",
    destination_dataset_table=(
        os.environ["GCP_PROJECT"] + ":" + os.environ["BIGQUERY_DATASET"] + "."
        + "rating_results_{{ ds_nodash }}"
    ),
    sql=(
        "SELECT movieid, AVG(rating) as avg_rating, COUNT(*) as num_ratings "
        "FROM " + os.environ["BIGQUERY_DATASET"] + ".ratings "
        # Quote the templated date so it renders as a string literal;
        # an unquoted {{ ds }} would expand to bare 2021-01-01, which
        # BigQuery parses as integer subtraction.
        "WHERE DATE(timestamp) <= DATE('{{ ds }}') "
        "GROUP BY movieid "
        "ORDER BY avg_rating DESC"
    ),
    write_disposition="WRITE_TRUNCATE",
    create_disposition="CREATE_IF_NEEDED",
    bigquery_conn_id="gcp",
    dag=dag,
)

extract_top_ratings = BigQueryToGCSOperator(
    task_id="extract_top_ratings",
    source_project_dataset_table=(
        os.environ["GCP_PROJECT"] + ":" + os.environ["BIGQUERY_DATASET"] + "."
        + "rating_results_{{ ds_nodash }}"
    ),
    destination_cloud_storage_uris=[
        "gs://" + os.environ["RESULT_BUCKET"] + "/{{ ds_nodash }}.csv"