def add_numeric_stats_task(
    dag: models.DAG, feature_vars: dag_utils.FeatureConfigListMapping,
    storage_vars: dag_utils.AirflowVarsConfig
) -> bigquery_operator.BigQueryOperator:
  """Builds an Operator that generates numeric fact stats within a DAG.

  Args:
    dag: The dag that the task needs to be added to.
    feature_vars: The parsed config values from airflow feature object variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    Operator used to build numeric stats within a DAG.
  """
  num_feats = dag_utils.get_features(feature_vars, 'type', 'Numeric')
  stats_dataset = (f'{storage_vars["bq_working_project"]}.'
                   f'{storage_vars["bq_working_dataset"]}')
  numeric_stats_sql = pipeline_utils.render_sql_from_template(
      'numeric_stats',
      fact_table=f'{stats_dataset}.facts',
      feature_columns=[
          f'\'{dag_utils.get_feature_name(x)}\'' for x in num_feats
      ])
  return bigquery_operator.BigQueryOperator(
      task_id=_GENERATE_NUMERIC_STATS_TASK,
      sql=numeric_stats_sql,
      use_legacy_sql=False,
      destination_dataset_table=f'{stats_dataset}.num_facts_stats_table',
      create_disposition='CREATE_IF_NEEDED',
      write_disposition='WRITE_TRUNCATE',
      allow_large_results=True,
      dag=dag)
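# Usage sketch (an assumption, not part of the snippet above): attaching the
# numeric-stats task to a DAG. `example_dag`, `feature_config`, and
# `storage_config` are hypothetical placeholders; the exact schema of the
# parsed feature Variable is not shown in the original, so it is left empty.
import datetime

from airflow import models

example_dag = models.DAG(
    dag_id='numeric_stats_example',
    schedule_interval=None,
    start_date=datetime.datetime(2021, 1, 1))
feature_config = []   # hypothetical: parsed "feature" Airflow Variable (list of feature configs)
storage_config = {    # hypothetical: parsed "storage" Airflow Variable
    'bq_working_project': 'my-project',
    'bq_working_dataset': 'bb_working',
}
numeric_stats = add_numeric_stats_task(example_dag, feature_config, storage_config)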
def _store_final_results_to_bq(dag: models.DAG, task_id: str,
                               batch_predict_sql: str) -> models.BaseOperator:
  """Stores MP-compliant results in BigQuery before the GA transfer.

  Args:
    dag: The DAG to add this operator to.
    task_id: ID for this specific task within the DAG.
    batch_predict_sql: Custom query to pick records and add the additional
      columns required by the Measurement Protocol.

  Returns:
    Operator to use within a DAG to store prediction results in BigQuery.
  """
  bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  final_output_table = '{project}.{dataset}.final_output'.format(
      project=storage_vars['bq_working_project'],
      dataset=storage_vars['bq_working_dataset'])
  return bigquery_operator.BigQueryOperator(
      task_id=task_id,
      sql=batch_predict_sql,
      use_legacy_sql=False,
      destination_dataset_table=final_output_table,
      create_disposition='CREATE_IF_NEEDED',
      write_disposition='WRITE_TRUNCATE',
      allow_large_results=True,
      location=bb_vars['gcp_region'],
      dag=dag,
  )
def add_prepare_source_data_task_to_dag(
    dag: models.DAG, sql: str, leads_table: str,
    gcp_region: str) -> bigquery_operator.BigQueryOperator:
  """Adds the BQ task to dag to create ML source table from GA source table.

  Args:
    dag: The dag that the task needs to be added to.
    sql: The SQL statement that builds the ML source table.
    leads_table: The destination table the output data will be written to.
    gcp_region: GCP region the job will run in.

  Returns:
    The configured BigQueryOperator task that was added to the input dag.
  """
  prepare_source_data = bigquery_operator.BigQueryOperator(
      task_id=TASK_NAME,
      sql=sql,
      use_legacy_sql=False,
      destination_dataset_table=leads_table,
      create_disposition='CREATE_IF_NEEDED',
      write_disposition='WRITE_TRUNCATE',
      allow_large_results=True,
      location=gcp_region,
      dag=dag,
  )
  return prepare_source_data
def _add_prepare_automl_data_in_bq_task(
    dag: models.DAG, output_type: blockbuster_constants.PreprocessingType,
    prediction_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig
) -> bigquery_operator.BigQueryOperator:
  """Adds the task to write the output to BigQuery to the dag.

  Args:
    dag: The dag that the task needs to be added to.
    output_type: Indicates whether this pipeline is to be used for training or
      prediction.
    prediction_vars: The parsed config values from airflow prediction variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    The configured BigQueryOperator task to write input data for AutoML that
    was added to the dag.
  """
  exclude_from_output = ['userId', 'RECENT_BB_id', 'RECENT_most_recent_lead']
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    output_table = 'training'
    exclude_from_output.append('BB_id')
  elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
    output_table = 'prediction'
    exclude_from_output.append('MLDataSplit')
  features_table = dag_utils.construct_bq_table_path(
      storage_vars['bq_working_project'], storage_vars['bq_working_dataset'],
      f'ga_{output_table}_input')
  prepare_data_sql = pipeline_utils.render_sql_from_template(
      'prepare_data',
      features_table=features_table,
      exclude_from_output=exclude_from_output,
      inclusion_recency_days=prediction_vars['leads_submission_window'])
  output_dataset = dag_utils.construct_bq_table_path(
      storage_vars['bq_working_project'], storage_vars['bq_working_dataset'],
      f'automl_{output_table}_input')
  prepare_data_for_automl = bigquery_operator.BigQueryOperator(
      task_id='prepare_data_for_automl',
      sql=prepare_data_sql,
      use_legacy_sql=False,
      destination_dataset_table=output_dataset,
      create_disposition='CREATE_IF_NEEDED',
      write_disposition='WRITE_TRUNCATE',
      allow_large_results=True,
      dag=dag,
  )
  return prepare_data_for_automl
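# Usage sketch (an assumption): the same helper serves both pipelines, switched
# by the PreprocessingType enum. `training_dag`, `prediction_dag`,
# `prediction_config`, and `storage_config` are hypothetical placeholders for
# the DAGs and parsed Airflow Variables built elsewhere in the pipeline.
prepare_training_input = _add_prepare_automl_data_in_bq_task(
    training_dag, blockbuster_constants.PreprocessingType.TRAINING,
    prediction_config, storage_config)
prepare_prediction_input = _add_prepare_automl_data_in_bq_task(
    prediction_dag, blockbuster_constants.PreprocessingType.PREDICTION,
    prediction_config, storage_config)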
def create_bq_operator_dict_from(sql_statements_dictionary):
  """Builds one BigQueryOperator per SQL file, keyed by the original filename.

  Keys are expected to look like 'tmp/<table>.sql'; values are the raw SQL
  file contents as bytes.
  """
  result = {}
  for filename, sql_bytes in sql_statements_dictionary.items():
    sql = sql_bytes.decode()
    # Derive the destination table id from the filename,
    # e.g. 'tmp/daily_summary.sql' -> 'daily_summary'.
    table_id = filename.replace('.sql', '').replace('tmp/', '')
    result[filename] = bigquery_operator.BigQueryOperator(
        task_id='make_tmp_table_{}'.format(table_id),
        sql=sql,
        use_legacy_sql=False,
        location='asia-southeast1',
        destination_dataset_table=gcp_project + '.tmp.' + table_id,
        write_disposition='WRITE_TRUNCATE')
  return result
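# Usage sketch (an assumption, mirroring the raw-table pattern that appears
# further below): fan the generated operators out between two boundary tasks.
# `read_sql_from_gcs`, `gcs_bucket`, `bq_tmp_dataset_name`, and
# `default_dag_args` are hypothetical stand-ins for names defined in the real
# DAG file.
tmp_sql_files = read_sql_from_gcs(bq_tmp_dataset_name, gcs_bucket)

with models.DAG('make_tmp_tables_example',
                schedule_interval=None,
                default_args=default_dag_args) as dag:
    tmp_table_operators = create_bq_operator_dict_from(tmp_sql_files)
    start_making_tmp_tables = dummy_operator.DummyOperator(
        task_id='start_making_tmp_tables')
    end_making_tmp_tables = dummy_operator.DummyOperator(
        task_id='end_making_tmp_tables')
    for make_tmp_table in tmp_table_operators.values():
        start_making_tmp_tables >> make_tmp_table >> end_making_tmp_tables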
    source_format='NEWLINE_DELIMITED_JSON')

# Listing 6-5. Defining the task that writes to the gcpbook_ch5.dau table.
# Join BigQuery's working table with the user table, compute the number of
# paying users and free-to-play users, and write the result to the
# gcpbook_ch5.dau table.
insert_dau = bigquery_operator.BigQueryOperator(
    task_id='insert_dau',
    use_legacy_sql=False,
    sql="""
        insert gcpbook_ch5.dau
        select
            date('{{ ds }}') as dt
          , countif(u.is_paid_user) as paid_users
          , countif(not u.is_paid_user) as free_to_play_users
        from
            (
                select distinct user_pseudo_id
                from gcpbook_ch5.work_events
            ) e
            inner join
            gcpbook_ch5.users u
            on u.user_pseudo_id = e.user_pseudo_id
    """)

# Listing 6-6. Defining the task that deletes the working table.
# Define a task that deletes BigQuery's working table.
delete_work_table = \
    bigquery_table_delete_operator.BigQueryTableDeleteOperator(
        task_id='delete_work_table',
    task_id='generate_account_index',
    python_callable=generate_accounts_views_index)

project_id = models.Variable.get('GCP_PROJECT', 'dta-ga-bigquery')
temp_table = 'ga_domains'
query = """
    CREATE TABLE `{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}`
    OPTIONS(
        expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR)
    ) AS
    select distinct domain_name, hostname, ga_code
    from web_crawl.url_resource, unnest(google_analytics) as ga_code
"""

query_webcrawl_ga_index = bigquery_operator.BigQueryOperator(
    task_id='query_webcrawl_ga_index',
    bql=query,
    use_legacy_sql=False,
    params={
        'project_id': project_id,
        'temp_table': temp_table,
    })

export_webcrawl_ga_index_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_webcrawl_ga_index_to_gcs',
    source_project_dataset_table=
    "{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}",
    params={
        'project_id': project_id,
        'temp_table': temp_table
    },
    destination_cloud_storage_uris=[
        "gs://%s/data/%s.csv" % (models.Variable.get(
            'AIRFLOW_BUCKET', 'us-east1-dta-airflow-b3415db4-bucket'), temp_table)
    'retry_delay': datetime.timedelta(minutes=5)
}

with models.DAG(
        'bigquery_data_export_rds',
        # schedule_interval=datetime.timedelta(days=1),
        schedule_interval=None,
        catchup=False,
        default_args=default_dag_args) as dag:

    project_id = models.Variable.get('GCP_PROJECT', 'dta-ga-bigquery')

    # BSQL script to pull the data from BigQuery
    bigquery_data_script = bigquery_operator.BigQueryOperator(
        task_id='bigquery_data_script',
        bql=pathlib.Path(
            dobs_constants.DAGS_DIR +
            "/bq_observatory_service/bq_sql_ga_accounts_data_query.sql").read_text(),
        use_legacy_sql=False)

    # BigQuery data fetch
    bigquery_data_fetch = bigquery_get_data.BigQueryGetDataOperator(
        task_id='bigquery_data_fetch',
        dataset_id=dobs_constants.DATASET_ID,
        table_id=dobs_constants.TABLE_ID)

    def prepare_data(data_, id_):
        output = []
        for datum in data_:
            agency_name, webproperty_id, domain_name, hostname, service_name = datum
            output.append({
from airflow.utils.dates import days_ago
from datetime import timedelta

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": days_ago(1),
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

# dag = DAG("tableau_dags", default_args=default_args, schedule_interval='@daily', catchup=False)
dag = DAG("testbq", default_args=default_args,
          schedule_interval='0 10 * * *', catchup=False)

# Copy recent rows from the superstore table into a second table.
bq_recent_questions_query = bigquery_operator.BigQueryOperator(
    task_id='bq_recent_questions_query',
    bigquery_conn_id='bigquery_default',
    sql="""
    SELECT *
    FROM `fr-stg-self-service-bi.sekitest.superstore_seki`
    LIMIT 500
    """,
    use_legacy_sql=False,
    destination_dataset_table='fr-stg-self-service-bi.sekitest.superstore_seki2',
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

bq_recent_questions_query
            `dta-ga-bigquery.{{params.view_id}}.ga_sessions_*`,
            UNNEST(hits)
        WHERE _TABLE_SUFFIX BETWEEN '{{params.start}}' AND '{{params.end}}'
        AND REGEXP_CONTAINS(page.pagePath, r"{{ params.search_param }}=(.*?)(?:&|$)")
        group by query
        order by count(*) desc
    """

    query_internalsearch = bigquery_operator.BigQueryOperator(
        task_id='query_internalsearch_' + galileo.domain_slug(d['domain']),
        bql=query,
        use_legacy_sql=False,
        params={
            'project_id': project_id,
            'view_id': d['view_id'],
            'start': start,
            'end': end,
            'temp_table': temp_table,
            'domain': d['domain'],
            'search_param': d['search_param']
        })

    export_internalsearch_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_internalsearch_to_gcs_' + galileo.domain_slug(d['domain']),
        source_project_dataset_table=
        "{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}",
        params={
            'project_id': project_id,
            'temp_table': temp_table
        },
        destination_cloud_storage_uris=[
            "gs://%s/data/searchqueries/%s.csv" % (
                models.Variable.get('AIRFLOW_BUCKET',
                                    'us-east1-dta-airflow-b3415db4-bucket'),
                temp_table)],
from airflow.contrib.operators import bigquery_to_gcs
from airflow.utils import trigger_rule

default_dag_args = {
    'start_date': airflow.utils.dates.days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
}

# Replace <Your bucket> with your path details
output_file = 'gs://southamerica-east1-cassio-a-77e1beeb-bucket/data/address.csv'

with DAG(dag_id='demo_bq_dag',
         schedule_interval=datetime.timedelta(days=1),
         default_args=default_dag_args) as dag:

    bq_airflow_commits_query = bigquery_operator.BigQueryOperator(
        task_id='bq_airflow_commits_query',
        bql="""
        SELECT Address FROM [airflow-studies:Address.Add]
        """)

    export_commits_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_airflow_commits_to_gcs',
        source_project_dataset_table='airflow-studies:Address.Add',
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    bq_airflow_commits_query >> export_commits_to_gcs
bq_dataset_name = 'test.users_data_last'

default_dag_args = {
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5)
}

# [START bigquery_extracton_test]
with models.DAG(
        'users_data_bigquery_load_test',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END bigquery_extracton_test]

    bq_extraction_test = bigquery_operator.BigQueryOperator(
        task_id='bq_extraction_test_query',
        bql="""
        SELECT * FROM `test.users_data`
        """,
        use_legacy_sql=False,
        destination_dataset_table=bq_dataset_name)

    bq_extraction_test
}

with models.DAG(
        'pageviews_snapshot_doi',
        # schedule_interval=datetime.timedelta(days=1),
        schedule_interval='0 12 * * *',
        catchup=False,
        default_args=default_dag_args) as dag:

    project_id = models.Variable.get('GCP_PROJECT', 'dta-ga-bigquery')

    # BigQuery scripts

    # pageviews snapshot
    query_pageviews_snapshot = bigquery_operator.BigQueryOperator(
        task_id='query_pageviews_snapshot',
        priority='BATCH',
        bql=pathlib.Path(
            galileo.DAGS_DIR +
            "/bq_scripts_doi/dta_sql_pgvw_daily_snapshot_full_doi").read_text(),
        use_legacy_sql=False)

    query_pageviews_snapshot_delta = bigquery_operator.BigQueryOperator(
        task_id='query_pageviews_snapshot_delta',
        priority='BATCH',
        bql=pathlib.Path(
            galileo.DAGS_DIR +
            "/bq_scripts_doi/dta_sql_pgvw_daily_snapshot_delta_doi").read_text(),
        use_legacy_sql=False)

    # # total visitors snapshot
    # query_total_visitors_snapshot = bigquery_operator.BigQueryOperator(
}

with models.DAG(
        'bigquery_output_prototype',
        # schedule_interval=datetime.timedelta(days=1),
        schedule_interval=None,
        catchup=False,
        on_failure_callback=None,
        default_args=default_dag_args) as dag:

    project_id = models.Variable.get('GCP_PROJECT', 'dta-ga-bigquery')

    # BSQL script to pull the data from BigQuery
    bigquery_data_type1 = bigquery_operator.BigQueryOperator(
        task_id='bigquery_data_type1',
        priority='BATCH',
        bql=pathlib.Path(
            dobs_constants.DAGS_DIR +
            "/bq_observatory_service/bq_sql_exec_basics_weekly.sql").read_text(),
        use_legacy_sql=False)

    bigquery_data_type2 = bigquery_operator.BigQueryOperator(
        task_id='bigquery_data_type2',
        priority='BATCH',
        bql=pathlib.Path(
            dobs_constants.DAGS_DIR +
            "/bq_observatory_service/bq_sql_exec_basics_daily.sql").read_text(),
        use_legacy_sql=False)

    bigquery_data_type3 = bigquery_operator.BigQueryOperator(
        task_id='bigquery_data_type3',
        priority='BATCH',
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=2),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('current_products_on_sale_notification',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    query_current_sales_products = bigquery_operator.BigQueryOperator(
        task_id='query_products_on_sale',
        bql="""
        SELECT product_id, product_name
        FROM `{bq_table_id}`
        WHERE sale = True
        """.format(bq_table_id=models.Variable.get('bq_table_id')),
        use_legacy_sql=False,
        destination_dataset_table=bq_products_on_sale_table_id,
        write_disposition='WRITE_TRUNCATE')

    export_data_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_sale_data_to_gcs',
        source_project_dataset_table=bq_products_on_sale_table_id,
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    email_updation_notification = email_operator.EmailOperator(
        task_id='email_notification',
        to=email_id,
        subject='Sale product data updated',
    'start_date': airflow.utils.dates.days_ago(2)
}

with models.DAG(
        dag_id='load_data_from_files_v2',
        default_args=default_dag_args,
        schedule_interval=None) as dag:

    # [START howto_operator_gcs_to_bq]
    load_table_a = GoogleCloudStorageToBigQueryOperator(
        task_id='load_table_a',
        bucket='fs-storage-cb-sh',
        source_objects=['LPMXfile_table_a_YYMMDD.txt'],
        field_delimiter='|',
        destination_project_dataset_table='gcp_schema_bigquery.gcp_table_a',
        write_disposition='WRITE_TRUNCATE',
        dag=dag)
    # [END howto_operator_gcs_to_bq]

    copy_data_to_b = bigquery_operator.BigQueryOperator(
        task_id='copy_data_to_b',
        sql="""
        insert into {0}.TABLE_B
        select * from {1}.gcp_table_a
        """.format(dm_ex, dm_ex),
        bigquery_conn_id='bigquery_default',
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=True,
        use_legacy_sql=False,
        dag=dag)

    load_table_a >> copy_data_to_b
}

output_file = 'gs://us-east1-example-environmen-3bb7fd7a-bucket/data/product.csv'

dag = DAG(
    'bq_export_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval="@once",
)

start_task = dummy_operator.DummyOperator(
    task_id='start',
    default_args=default_args,
    dag=dag,
)

bq_query_task = bigquery_operator.BigQueryOperator(
    task_id='bq_query',
    bql='''SELECT * FROM [hd-personalization-dev.temp.Prod] ''',
    dag=dag)

export_to_gcs_task = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_to_gcs',
    source_project_dataset_table='hd-personalization-dev.temp.Prod',
    destination_cloud_storage_uris=[output_file],
    export_format='CSV',
    dag=dag)

start_task >> bq_query_task >> export_to_gcs_task
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

# [START composer_notify_failure]
default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': 'thermal-loop-237313'
}

with models.DAG('batman_bi_daily_report',
                schedule_interval=datetime.timedelta(hours=1),
                default_args=default_dag_args) as dag:

    bq_recent_questions_query = bigquery_operator.BigQueryOperator(
        task_id='bq_count_crimes_by_locations',
        bql="""
        SELECT
          DATE(`date`, "America/Chicago") as report_day,
          primary_type,
          location_description,
          count(*) as crime_count
        FROM `batman_bi.crime_logs`
        GROUP BY 1, 2, 3
        """,
        use_legacy_sql=False,
        destination_dataset_table='batman_bi.crime_count_by_location',
        write_disposition='WRITE_TRUNCATE')
    task_id='delete_sl_partition',
    dag=dag,
    bash_command=
    '''bq rm -f -t 'dataops_demo_sl_dev.t_twitter_google${{ macros.ds_format(ds, "%Y-%m-%d", "%Y%m%d") }}' ''',
)

# We could of course do something far more complex; here we take advantage of
# the json field, so there is no need to reload everything.
from_raw_to_sl = bigquery_operator.BigQueryOperator(
    task_id='from_raw_to_sl',
    dag=dag,
    sql='''SELECT PARSE_TIMESTAMP('%a %b %d %H:%M:%S +0000 %E4Y', created_at) AS c_timestamp,
                  CAST(PARSE_TIMESTAMP('%a %b %d %H:%M:%S +0000 %E4Y', created_at) AS date) as c_created,
                  id, user_id, user_name, lang, user_screen_name, text,
                  JSON_EXTRACT(json, '$.source') AS source
           FROM dataops_demo_raw_dev.t_twitter_google_{{ macros.ds_format(ds, "%Y-%m-%d", "%Y_%m_%d") }}''',
    destination_dataset_table='dataops_demo_sl_dev.t_twitter_google',
    write_disposition='WRITE_APPEND',
    create_disposition='CREATE_IF_NEEDED',
    time_partitioning={
        'type': 'DAY',
        'field': 'c_created'
    },
    allow_large_results=True,
    use_legacy_sql=False)

# twitter_spark_etl = dagrun_operator.TriggerDagRunOperator(
#     task_id="execute_other_dag",
#     trigger_dag_id="twitter_spark_etl",  # Ensure this equals the dag_id of the DAG to trigger
#     conf={"job_date": "{{ ds }}"},  # No need for this parameter, please check execution_date
#     execution_date="{{ ds }}",
#     dag=dag,
bq_merge_table1 = bigquery_operator.BigQueryOperator(
    task_id='bq_merge_table1',
    bql="""
    MERGE `{table}` T
    USING (
      SELECT
        CASE
          WHEN a.id IS NULL AND b.id IS NOT NULL THEN "I"
          WHEN a.id IS NOT NULL AND b.id IS NULL THEN "D"
          ELSE "U"
        END AS op,
        IF(b.id IS NULL, a, b).*
      FROM `Google_CarlosAugusto.table1` a
      FULL OUTER JOIN `Google_CarlosAugusto.table1_staging` b
      ON a.id = b.id
      WHERE a.id IS NULL OR b.id IS NULL
        OR FARM_FINGERPRINT(FORMAT("%T", a)) <> FARM_FINGERPRINT(FORMAT("%T", b))
    ) S
    ON T.id = S.id
    WHEN MATCHED AND S.op = 'U' THEN
      UPDATE SET col1 = S.col1, col2 = S.col2, col3 = S.col3, col4 = S.col4
    WHEN MATCHED AND S.op = 'D' THEN
      DELETE
    WHEN NOT MATCHED THEN
      INSERT (id, col1, col2, col3, col4) VALUES (S.id, S.col1, S.col2, S.col3, S.col4)
    """.format(table="Google_CarlosAugusto.table1"),
    use_legacy_sql=False,
    dag=dag)
    # 'start_date': airflow.utils.dates.days_ago(1)
    'email': models.Variable.get('email'),
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

output_file = 'gs://prp-target/address.csv'

with airflow.DAG(
        dag_id='demo_bq_dag',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    bq_airflow_commits_query = bigquery_operator.BigQueryOperator(
        task_id='bq_airflow_commits_query',
        bql='''
        SELECT * FROM [able-hull-251304.my_dataset.Address]
        ''')

    export_commits_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_airflow_commits_to_gcs',
        source_project_dataset_table='able-hull-251304.my_dataset.Address',
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    bq_airflow_commits_query >> export_commits_to_gcs
default_dag_args = {
    # Setting start date as yesterday starts the DAG immediately when it is
    # detected in the Cloud Storage bucket.
    'start_date': yesterday,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG('composer_sample_connections',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    # [START composer_connections_default_airflow_1]
    task_default = bigquery_operator.BigQueryOperator(
        task_id='task_default_connection',
        sql='SELECT 1',
        use_legacy_sql=False)
    # [END composer_connections_default_airflow_1]

    # [START composer_connections_explicit_airflow_1]
    task_explicit = bigquery_operator.BigQueryOperator(
        task_id='task_explicit_connection',
        sql='SELECT 1',
        use_legacy_sql=False,
        # Composer creates a 'google_cloud_default' connection by default.
        bigquery_conn_id='google_cloud_default')
    # [END composer_connections_explicit_airflow_1]

    # [START composer_connections_custom_airflow_1]
    task_custom = bigquery_operator.BigQueryOperator(
        task_id='task_custom_connection',
        sql='SELECT 1',
        use_legacy_sql=False,
    make_bq_dataset = bash_operator.BashOperator(
        task_id='make_bq_dataset',
        # Executing 'bq' command requires Google Cloud SDK which comes
        # preinstalled in Cloud Composer.
        bash_command='bq ls {} || bq mk {}'.format(
            bq_dataset_name, bq_dataset_name))
    # [END composer_bash_bq]

    # [START composer_bigquery]
    # Query recent StackOverflow questions.
    bq_recent_questions_query = bigquery_operator.BigQueryOperator(
        task_id='bq_recent_questions_query',
        sql="""
        SELECT owner_display_name, title, view_count
        FROM `bigquery-public-data.stackoverflow.posts_questions`
        WHERE creation_date < CAST('{max_date}' AS TIMESTAMP)
            AND creation_date >= CAST('{min_date}' AS TIMESTAMP)
        ORDER BY view_count DESC
        LIMIT 100
        """.format(max_date=max_query_date, min_date=min_query_date),
        use_legacy_sql=False,
        destination_dataset_table=bq_recent_questions_table_id)
    # [END composer_bigquery]

    # Export query result to Cloud Storage.
    export_questions_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_recent_questions_to_gcs',
        source_project_dataset_table=bq_recent_questions_table_id,
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    # Perform most popular question query.
    task_id='Run_Fact_Actuary_Staging',
    bash_command=
    f"python3 /home/airflow/gcs/dags/dp_exec/DIM_EXEC2.py --pipeline Fact_Actuary_Staging --dataset_meta {dataset_meta}",
    dag=Areaman_as_dag)

t1 = BashOperator(
    task_id='Run_Fact_Actuary',
    bash_command=
    f"python3 /home/airflow/gcs/dags/dp_exec/DIM_EXEC2.py --pipeline Fact_Actuary --dataset_meta {dataset_meta}",
    dag=Areaman_as_dag)

# To produce DB_Run_Off_Fields table
t3 = BashOperator(
    task_id='Run_DB_Run_Off',
    bash_command=
    f"python3 /home/airflow/gcs/dags/dp_exec/DIM_EXEC2.py --pipeline DB_Run_Off --dataset_meta {dataset_meta}",
    dag=Areaman_as_dag)

t100 = bigquery_operator.BigQueryOperator(
    task_id='Fact_Actuary_Prestaging',
    sql=f"""
    CREATE TABLE `geb-dwh-test.uat_geb_dwh_eu_act.fact_actuary_w_o_null` AS
    SELECT *
    FROM `geb-dwh-test.uat_geb_dwh_eu_act.source_fact_actuary_staging`
    WHERE LOCALINSURER IS NOT NULL
    ORDER BY LOCALINSURER, CARRIERREFERENCE, RE_BALANCEYEARPTF, RE_QUARTERPTF
    """,
    use_legacy_sql=False,
    dag=Areaman_as_dag)

t00 >> t0 >> t1 >> t100 >> t9 >> t99
# t00 >> t0 >> t01 >> t02 >> t1 >>
    project_id='umg-spinnup',
    dataset_id='spotify',
    table_id='spotify_trends${{ ds_nodash }}',
    row_count=100000,
    poke_interval=600,
    bigquery_conn_id='bigquery_default',
    pool='sensors',
    dag=dag
)

# Generate dataset
generate_dataset = bigquery_operator.BigQueryOperator(
    task_id='generate_dataset',
    sql='sql/spinnup/fraud_detection_v02_data.sql',
    destination_dataset_table='umg-data-science.detect_fraud_spinnup.v02_data${{ ds_nodash }}',
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bigquery_conn_id='bigquery_default',
    use_legacy_sql=False,
    dag=dag
)

# Make a temp directory
mkdir = BashOperator(
    task_id='mkdir',
    bash_command='mkdir -p {}'.format(DIR),
    dag=dag
)

# Download python script from GCS
download_script = gcs_download_operator.GoogleCloudStorageDownloadOperator(
    task_id='download_python_script',
    skip_leading_rows=1,
    destination_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'data_source'),
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag
)

# Clean the data from BigQuery to BigQuery
t2 = bigquery_operator.BigQueryOperator(
    task_id='bq_from_source_to_clean',
    bql='{}/common/clean.sql'.format(GCS_SQL),
    use_legacy_sql=False,
    allow_large_results=True,
    destination_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'data_cleaned'),
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag
)

# Creates the split between features and targets and also aggregates both sides.
# The threshold date is passed as an arg when calling the Airflow job and is
# dynamically understood within the .sql file.
# We should pass query_params, but we run into various problems:
# - if using BigQueryOperator, we can not pass dag_run.conf['threshold_date']
# - if using hooks, run_query does not accept a .sql file and needs the string
# So the way is to add {{ dag_run.conf['threshold_date'] }} directly into the
# .sql file, which Airflow picks up when running the operator.
t3 = bigquery_operator.BigQueryOperator(
raw_sql_files = read_sql_from_gcs(bq_raw_dataset_name, gcs_bucket)

bq_start_making_raw_tables = dummy_operator.DummyOperator(
    task_id='start_making_raw_tables')

bq_end_making_raw_tables = dummy_operator.DummyOperator(
    task_id='end_making_raw_tables')

for filename in raw_sql_files:
    sql_statement = raw_sql_files[filename].decode()
    table_name = filename.replace('.sql', '')
    table_name = table_name.replace('raw/', '')
    bq_make_raw_tables = bigquery_operator.BigQueryOperator(
        task_id='make_raw_table_{}'.format(table_name),
        sql=sql_statement,
        use_legacy_sql=False,
        location='asia-southeast1')
    bq_start_making_raw_tables >> bq_make_raw_tables
    bq_make_raw_tables >> bq_end_making_raw_tables

bq_make_raw_dataset >> bq_start_making_raw_tables

bq_make_tmp_dataset = bash_operator.BashOperator(
    task_id='make_bq_tmp_dataset',
    bash_command=
    'bq --location=asia-southeast1 ls {} || bq --location=asia-southeast1 mk {}'
    .format(bq_tmp_dataset_name, bq_tmp_dataset_name))

bq_start_making_tmp_tables = dummy_operator.DummyOperator(
    task_id='start_making_tmp_tables')
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_sample_bq_notify',
        schedule_interval=datetime.timedelta(weeks=4),
        default_args=default_dag_args) as dag:

    # Query today's data
    bq_fetch_todays_data = bigquery_operator.BigQueryOperator(
        task_id='bq_fetch_todays_data',
        bql="""
        SELECT *
        FROM `camel-154800.chicago_historical_congestion_data.Aug2_Larger_dataset_view`
        WHERE FULL_DATE = CURRENT_DATE("America/Chicago")
        ORDER BY HH_hour DESC
        """,
        use_legacy_sql=False,
        destination_dataset_table=bq_temp_composer_dataset)
    # [END composer_bigquery]

    # [START composer_bigquery]
    # Export query result to Cloud Storage.
    export_output_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_output_to_gcs',
        source_project_dataset_table=bq_temp_composer_dataset,
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')
default_args = {
    'owner': 'Nitin Ware',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(1),
}

dag = DAG(
    'bq_conn_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval="@once",
)

task_default = bigquery_operator.BigQueryOperator(
    task_id='task_default_connection',
    bql='SELECT 1',
    use_legacy_sql=False,
    dag=dag)

task_explicit = bigquery_operator.BigQueryOperator(
    task_id='task_explicit_connection',
    bql='SELECT 1',
    use_legacy_sql=False,
    bigquery_conn_id='google_cloud_default',
    dag=dag)

task_custom = bigquery_operator.BigQueryOperator(
    task_id='task_custom_connection',
    bql='SELECT 1',
    use_legacy_sql=False,
    bigquery_conn_id='my_gcp_connection',
_wait_for_completion = wait_for_completion_operator.WaitForCompletionOperator(
    task_id=_WAIT_FOR_COMPLETION_TASK,
    dag=dag,
    project_id=_PROJECT_ID,
    queue_location=_QUEUE_LOCATION,
    queue_name=_QUEUE_NAME,
    service_account_path=_SERVICE_ACCOUNT,
    try_count_limit=_TRY_COUNT_LIMIT)

_update_expiration_tracking = bigquery_operator.BigQueryOperator(
    task_id=_UPDATE_EXPIRATION_TRACKING,
    dag=dag,
    sql=_UPDATE_ITEMS_EXPIRATION_TRACKING_QUERY,
    use_legacy_sql=False,
    params={
        'timezone': _TIMEZONE_UTC_OFFSET,
        'feed_dataset_id': _FEED_DATASET_ID,
        'expiration_tracking_table_id': _EXPIRATION_TRACKING_TABLE_ID,
        'monitor_dataset_id': _MONITOR_DATASET_ID,
        'item_results_table_id': _ITEM_RESULTS_TABLE_ID
    })

_bq_to_pubsub = bq_to_pubsub_operator.GetRunResultsAndTriggerReportingOperator(
    task_id=_TRIGGER_REPORTING_TASK,
    dag=dag,
    project_id=_PROJECT_ID,
    dataset_id=_MONITOR_DATASET_ID,
    table_id=_MONITOR_TABLE_ID,
    query_file_path=_RESULT_QUERY_FILE,
    topic_name=_TOPIC_NAME)