def _transfer_bigquery_to_gcs(dag, task_id) -> models.BaseOperator:
  """Builds the pipeline step that transfers the final output records to GCS.

  Args:
    dag: The DAG to add this operator to.
    task_id: ID for this specific task within the DAG.

  Returns:
    Operator to use within a DAG to run the pipeline for moving records to GCS.
  """
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  final_output_uri = '{path}/result-{timestamp}-*.json'.format(
      path=storage_vars['gcs_output_path'], timestamp=int(time.time()))
  final_output_table = '{project}.{dataset}.final_output'.format(
      project=storage_vars['bq_working_project'],
      dataset=storage_vars['bq_working_dataset'])
  return bigquery_to_gcs.BigQueryToCloudStorageOperator(
      task_id=task_id,
      source_project_dataset_table=final_output_table,
      destination_cloud_storage_uris=[final_output_uri],
      export_format='NEWLINE_DELIMITED_JSON',
      dag=dag)
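# A minimal usage sketch (not part of the original module): wiring the helper into a
# DAG. The DAG id and schedule below are assumptions for illustration only; the
# module-level imports used by the helper (time, models, airflow_utils,
# blockbuster_constants, bigquery_to_gcs) are assumed to exist in the real file.
import datetime

from airflow import models
from airflow.utils import dates

dag = models.DAG(
    dag_id='blockbuster_final_output_export',  # hypothetical DAG id
    schedule_interval=datetime.timedelta(days=1),
    start_date=dates.days_ago(1))

transfer_task = _transfer_bigquery_to_gcs(dag, 'transfer_final_output_to_gcs')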
import datetime

import airflow.utils.dates
from airflow import DAG
from airflow.contrib.operators import bigquery_operator
from airflow.contrib.operators import bigquery_to_gcs
from airflow.utils import trigger_rule

default_dag_args = {
    'start_date': airflow.utils.dates.days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
}

# Replace the bucket in this path with your own bucket details.
output_file = 'gs://southamerica-east1-cassio-a-77e1beeb-bucket/data/address.csv'

with DAG(dag_id='demo_bq_dag',
         schedule_interval=datetime.timedelta(days=1),
         default_args=default_dag_args) as dag:

    bq_airflow_commits_query = bigquery_operator.BigQueryOperator(
        task_id='bq_airflow_commits_query',
        bql="""
        SELECT Address
        FROM [airflow-studies:Address.Add]
        """)

    export_commits_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_airflow_commits_to_gcs',
        source_project_dataset_table='airflow-studies:Address.Add',
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    bq_airflow_commits_query >> export_commits_to_gcs
# Get the table list from master file
all_records = read_table_list(table_list_file_path)

# Loop over each record in the 'all_records' python list to build up
# Airflow tasks
for record in all_records:
    logger.info('Generating tasks to transfer table: {}'.format(record))

    table_source = record['table_source']
    table_dest = record['table_dest']

    BQ_to_GCS = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        # Replace ":" with valid character for Airflow task
        task_id='{}_BQ_to_GCS'.format(table_source.replace(":", "_")),
        source_project_dataset_table=table_source,
        destination_cloud_storage_uris=[
            '{}-*.avro'.format('gs://' + source_bucket + '/' + table_source)
        ],
        export_format='AVRO')

    GCS_to_GCS = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator(
        # Replace ":" with valid character for Airflow task
        task_id='{}_GCS_to_GCS'.format(table_source.replace(":", "_")),
        source_bucket=source_bucket,
        source_object='{}-*.avro'.format(table_source),
        destination_bucket=dest_bucket,
        # destination_object='{}-*.avro'.format(table_dest)
    )

    GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
# query_device_opsbrowser_snapshot = bigquery_operator.BigQueryOperator(
#     task_id='query_device_opsbrowser_snapshot',
#     bql=pathlib.Path(galileo.DAGS_DIR + "/bq_scripts_doi/dta_sql_device_opsbrowser_daily_snapshot_full_doi").read_text(),
#     use_legacy_sql=False)

# query_device_opsbrowser_delta_snapshot = bigquery_operator.BigQueryOperator(
#     task_id='query_device_opsbrowser_delta_snapshot',
#     bql=pathlib.Path(galileo.DAGS_DIR + "/bq_scripts_doi/dta_sql_device_opsbrowser_daily_snapshot_delta_doi").read_text(),
#     use_legacy_sql=False)

# ============================================================================================================
# Export datasets

# pageviews snapshot
export_bq_to_gcs_json = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_json',
    source_project_dataset_table=
    "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_delta_doi",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/json/%s.json" %
        (models.Variable.get('AIRFLOW_BUCKET',
                             'us-east1-dta-airflow-b3415db4-bucket'),
         'pgviews_daily_snapshot_doi')
    ],
    export_format='NEWLINE_DELIMITED_JSON')

export_bq_to_gcs_csv = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_csv',
    source_project_dataset_table=
    "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_delta_doi",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/csv/%s.csv" %
        (models.Variable.get('AIRFLOW_BUCKET',
                             'us-east1-dta-airflow-b3415db4-bucket'),
    sql="""
    SELECT owner_display_name, title, view_count
    FROM `bigquery-public-data.stackoverflow.posts_questions`
    WHERE creation_date < CAST('{max_date}' AS TIMESTAMP)
        AND creation_date >= CAST('{min_date}' AS TIMESTAMP)
    ORDER BY view_count DESC
    LIMIT 100
    """.format(max_date=max_query_date, min_date=min_query_date),
    use_legacy_sql=False,
    destination_dataset_table=bq_recent_questions_table_id)
# [END composer_bigquery]

# Export query result to Cloud Storage.
export_questions_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_recent_questions_to_gcs',
    source_project_dataset_table=bq_recent_questions_table_id,
    destination_cloud_storage_uris=[output_file],
    export_format='CSV')

# Perform most popular question query.
bq_most_popular_query = bigquery_operator.BigQueryOperator(
    task_id='bq_most_popular_question_query',
    sql="""
    SELECT title, view_count
    FROM `{table}`
    ORDER BY view_count DESC
    LIMIT 1
    """.format(table=bq_recent_questions_table_id),
    use_legacy_sql=False,
    destination_dataset_table=bq_most_popular_table_id)
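# The names referenced above (bq_recent_questions_table_id, bq_most_popular_table_id,
# max_query_date, min_query_date, output_file) are defined elsewhere in the original
# DAG file. The sketch below shows plausible definitions; the specific values are
# assumptions for illustration, not the original source.
from airflow import models

bq_dataset_name = 'airflow_bq_dataset_{{ ds_nodash }}'  # assumed templated dataset name
bq_recent_questions_table_id = bq_dataset_name + '.recent_questions'
bq_most_popular_table_id = bq_dataset_name + '.most_popular'

max_query_date = '2018-02-01'  # assumed query window
min_query_date = '2018-01-01'

gcs_bucket = models.Variable.get('gcs_bucket', 'example-bucket')  # assumed Variable key
output_file = 'gs://{}/recent_questions.csv'.format(gcs_bucket)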
""" query_webcrawl_ga_index = bigquery_operator.BigQueryOperator( task_id='query_webcrawl_ga_index', bql=query, use_legacy_sql=False, params={ 'project_id': project_id, 'temp_table': temp_table, }) export_webcrawl_ga_index_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator( task_id='export_webcrawl_ga_index_to_gcs', source_project_dataset_table= "{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}", params={ 'project_id': project_id, 'temp_table': temp_table }, destination_cloud_storage_uris=[ "gs://%s/data/%s.csv" % (models.Variable.get( 'AIRFLOW_BUCKET', 'us-east1-dta-airflow-b3415db4-bucket'), temp_table) ], export_format='CSV') query_webcrawl_ga_index >> export_webcrawl_ga_index_to_gcs compare_webcrawl_ga_accounts = python_operator.PythonOperator( task_id='compare_webcrawl_ga_with_accounts', python_callable=compare_webcrawl_ga_with_accounts) generate_account_index >> compare_webcrawl_ga_accounts export_webcrawl_ga_index_to_gcs >> compare_webcrawl_ga_accounts if __name__ == '__main__':
query_device_opsbrowser_snapshot_intraday = bigquery_operator.BigQueryOperator(
    task_id='query_device_opsbrowser_snapshot_intraday',
    bql=pathlib.Path(
        galileo.DAGS_DIR +
        "/bq_scripts_doi/bq_scripts_delta/dta_sql_device_opsbrowser_snapshot_delta_intraday_doi"
    ).read_text(),
    use_legacy_sql=False)

# ===================================================================================================
# Export datasets

# pageviews snapshot
export_bq_to_gcs_json_pgviews = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_json_pgviews',
    source_project_dataset_table=
    "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_intraday_doi",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/delta_snapshot/%s.json" %
        (models.Variable.get('AIRFLOW_BUCKET',
                             'us-east1-dta-airflow-b3415db4-bucket'),
         'pgviews_daily_snapshot_doi_' + str(date.today()) + '_intraday')
    ],
    export_format='NEWLINE_DELIMITED_JSON')

export_bq_to_gcs_csv_pgviews = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_csv_pgviews',
    source_project_dataset_table=
    "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_intraday_doi",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/delta_snapshot/%s.csv" %
        (models.Variable.get('AIRFLOW_BUCKET',
                             'us-east1-dta-airflow-b3415db4-bucket'),
        schedule_interval=datetime.timedelta(weeks=4),
        default_args=default_dag_args) as dag:

    # Query today's data.
    bq_fetch_todays_data = bigquery_operator.BigQueryOperator(
        task_id='bq_fetch_todays_data',
        bql="""
        SELECT * FROM `camel-154800.chicago_historical_congestion_data.Aug2_Larger_dataset_view`
        WHERE FULL_DATE = CURRENT_DATE("America/Chicago")
        ORDER BY HH_hour DESC
        """,
        use_legacy_sql=False,
        destination_dataset_table=bq_temp_composer_dataset)
    # [END composer_bigquery]

    # [START composer_bigquery]
    # Export query result to Cloud Storage.
    export_output_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_output_to_gcs',
        source_project_dataset_table=bq_temp_composer_dataset,
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    # Delete BigQuery dataset
    # Delete the bq table
    # make_bq_dataset

    bq_fetch_todays_data >> export_output_to_gcs
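    # Sketch of the cleanup step hinted at by the "Delete the bq table" comment above.
    # This task is an assumption, not part of the original DAG; it uses the contrib
    # BigQueryTableDeleteOperator (import it at module level:
    #   from airflow.contrib.operators import bigquery_table_delete_operator)
    # to drop the temporary result table once the export has finished.
    delete_temp_table = bigquery_table_delete_operator.BigQueryTableDeleteOperator(
        task_id='delete_temp_table',
        deletion_dataset_table=bq_temp_composer_dataset,
        ignore_if_missing=True)

    export_output_to_gcs >> delete_temp_table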
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag
)

# TODO: Currently all data steps are done whether BTYD or DNN are used. It would
# be better to have a condition to call only one task or the other using 'model_type'.
data_btyd_location = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'btyd.csv')]
data_train_locations = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'train.csv')]
data_eval_locations = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'eval.csv')]
data_test_locations = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'test.csv')]

t5a = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='bq_dnn_train_to_gcs',
    source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'dnn_train'),
    destination_cloud_storage_uris=data_train_locations,
    print_header=False,
    dag=dag
)

t5b = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='bq_dnn_eval_to_gcs',
    source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'dnn_eval'),
    destination_cloud_storage_uris=data_eval_locations,
    print_header=False,
    dag=dag
)

t5c = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='bq_dnn_test_to_gcs',
    source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'dnn_test'),
}

output_file = 'gs://us-east1-example-environmen-3bb7fd7a-bucket/data/product.csv'

dag = DAG(
    'bq_export_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval="@once",
)

start_task = dummy_operator.DummyOperator(
    task_id='start',
    default_args=default_args,
    dag=dag,
)

bq_query_task = bigquery_operator.BigQueryOperator(
    task_id='bq_query',
    bql='''SELECT * FROM [hd-personalization-dev.temp.Prod] ''',
    dag=dag)

export_to_gcs_task = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_to_gcs',
    source_project_dataset_table='hd-personalization-dev.temp.Prod',
    destination_cloud_storage_uris=[output_file],
    export_format='CSV',
    dag=dag)

start_task >> bq_query_task >> export_to_gcs_task
    WHERE _TABLE_SUFFIX BETWEEN '{{params.start}}' AND '{{params.end}}'
    AND REGEXP_CONTAINS(page.pagePath, r"{{ params.search_param }}=(.*?)(?:&|$)")
    group by query
    order by count(*) desc
    """

query_internalsearch = bigquery_operator.BigQueryOperator(
    task_id='query_internalsearch_' + galileo.domain_slug(d['domain']),
    bql=query,
    use_legacy_sql=False,
    params={
        'project_id': project_id,
        'view_id': d['view_id'],
        'start': start,
        'end': end,
        'temp_table': temp_table,
        'domain': d['domain'],
        'search_param': d['search_param']
    })

export_internalsearch_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_internalsearch_to_gcs_' + galileo.domain_slug(d['domain']),
    source_project_dataset_table=
    "{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}",
    params={
        'project_id': project_id,
        'temp_table': temp_table
    },
    destination_cloud_storage_uris=[
        "gs://%s/data/searchqueries/%s.csv" %
        (models.Variable.get('AIRFLOW_BUCKET',
                             'us-east1-dta-airflow-b3415db4-bucket'), temp_table)
    ],
    export_format='CSV')

query_internalsearch >> export_internalsearch_to_gcs
    task_id='query_device_brand_12months_daily_snapshot',
    bql=pathlib.Path(
        galileo.DAGS_DIR +
        "/bq_scripts_ursa_major/dta_sql_devicebrand_snapshot_12months_daily_doi"
    ).read_text(),
    use_legacy_sql=False)

# ============================================================================================================
# ============================================================================================================
# Export datasets

# total unique visitors 90 days snapshot
export_bq_to_gcs_json_uniquevisitors = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_json_uniquevisitors',
    source_project_dataset_table=
    "{{params.project_id}}.dta_project_ursa_major.unique_visitors_90days_daily_delta_doi",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/project_ursa_major/%s.json" %
        (models.Variable.get('AIRFLOW_BUCKET',
                             'us-east1-dta-airflow-b3415db4-bucket'),
         'uniquevisitors_90days_daily_snapshot_doi')
    ],
    export_format='NEWLINE_DELIMITED_JSON')

export_bq_to_gcs_csv_uniquevisitors = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_csv_uniquevisitors',
    source_project_dataset_table=
    "{{params.project_id}}.dta_project_ursa_major.unique_visitors_90days_daily_delta_doi",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/project_ursa_major/%s.csv" %
        (models.Variable.get('AIRFLOW_BUCKET',
                             'us-east1-dta-airflow-b3415db4-bucket'),
    task_id='query_digital_services_counter',
    bql=pathlib.Path(
        galileo.DAGS_DIR +
        "/bq_scripts_ursa_major/dta_sql_websites_count_doi").read_text(),
    use_legacy_sql=False)

# ============================================================================================================
# ============================================================================================================
# Export datasets

# monthly websites and agencies counter
export_bq_to_gcs_json_counter = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_json_counter',
    source_project_dataset_table=
    "{{params.project_id}}.dta_project_ursa_major.ga_monthly_websites_counter",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/project_ursa_major/%s.json" %
        (models.Variable.get('AIRFLOW_BUCKET',
                             'us-east1-dta-airflow-b3415db4-bucket'),
         'websites_agencies_monthly_counter')
    ],
    export_format='NEWLINE_DELIMITED_JSON')

export_bq_to_gcs_csv_counter = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_csv_counter',
    source_project_dataset_table=
    "{{params.project_id}}.dta_project_ursa_major.ga_monthly_websites_counter",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/project_ursa_major/%s.csv" %
        (models.Variable.get('AIRFLOW_BUCKET',
                             'us-east1-dta-airflow-b3415db4-bucket'),
    UNNEST(hits) as hits
    WHERE hits.type="PAGE"
    ORDER BY fullVisitorId, visitId, visitNumber, hitNumber
    """

query_benchmark = bigquery_operator.BigQueryOperator(
    task_id='query_benchmark',
    bql=query,
    use_legacy_sql=False,
    params={
        'project_id': project_id,
        'view_id': view_id,
        'timestamp': timestamp,
        'temp_table': temp_table
    })

export_benchmark_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_benchmark_to_gcs',
    source_project_dataset_table="%s.tmp.%s" % (project_id, temp_table),
    destination_cloud_storage_uris=[
        "gs://%s/data/%s.csv" % (models.Variable.get(
            'AIRFLOW_BUCKET', 'us-east1-dta-airflow-b3415db4-bucket'),
                                 temp_table)
    ],
    export_format='CSV')

query_benchmark >> export_benchmark_to_gcs
        ,Security_Type_Child
        ,Security_Type_Parent
        ,Sector
        ,Trade_Date
        -- Account Information
        ,Account_Type
        ,Account_Name
        -- Financial Advisor Information
        ,FA_Code
        ,FA_Name
        -- Department
        ,Dept_Name
        ,Dept_City
        ,Dept_State""",
    use_legacy_sql=True,
    destination_dataset_table='gcp_flex_etl.trade_agg',
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    allow_large_results=True,
    dag=dag)

## Export trade_agg table to GCS
export_trade_agg_table = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_trade_agg_table',
    source_project_dataset_table='gcp_flex_etl.trade_agg',
    destination_cloud_storage_uris=[current_bucket],
    export_format='CSV',
    dag=dag)

start >> archive_old_data >> load_account_current >> load_trade_agg >> export_trade_agg_table >> end
start >> archive_old_data >> load_financial_advisor_current >> load_trade_agg >> export_trade_agg_table >> end
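# Illustrative sketch only: 'start', 'end' and 'current_bucket' are referenced above but
# defined earlier in the original DAG file. The definitions below are assumptions for
# illustration, not the original code -- boundary DummyOperator tasks plus the GCS
# destination used by the export task.
from airflow.operators import dummy_operator

current_bucket = 'gs://example-flex-etl-bucket/trade_agg/trade_agg-*.csv'  # assumed URI

start = dummy_operator.DummyOperator(task_id='start', dag=dag)
end = dummy_operator.DummyOperator(task_id='end', dag=dag)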
    use_legacy_sql=False)

query_pageviews_snapshot_delta = bigquery_operator.BigQueryOperator(
    task_id='query_pageviews_snapshot_delta',
    bql=pathlib.Path(
        galileo.DAGS_DIR +
        "/bq_scripts/dta_sql_pgvw_daily_snapshot_incremental").read_text(),
    use_legacy_sql=False)

# Export data from BQ to GCS
export_bq_to_gcs_json = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_json',
    source_project_dataset_table=
    "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_increment",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/%s.json" % (models.Variable.get(
            'AIRFLOW_BUCKET', 'us-east1-dta-airflow-b3415db4-bucket'),
                                            'pgviews_daily_snapshot')
    ],
    export_format='NEWLINE_DELIMITED_JSON')

export_bq_to_gcs_csv = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_bq_to_gcs_csv',
    source_project_dataset_table=
    "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_increment",
    params={'project_id': project_id},
    destination_cloud_storage_uris=[
        "gs://%s/data/analytics/%s.csv" % (models.Variable.get(
            'AIRFLOW_BUCKET', 'us-east1-dta-airflow-b3415db4-bucket'),
                                           'pgviews_daily_snapshot')