def bqimport(ts, dag, tablename, schedule, **kwargs):
    pg_conn = config[schedule][tablename][1]
    table_schema = tablename.split('.')[1]
    table_name = tablename.split('.')[2]
    export_datetime = ts
    sync_interval = schedule
    bqload = GoogleCloudStorageToBigQueryOperator(
        task_id='func_bqload_{}'.format(table_name),
        bucket='prod-data-sync-bucket',
        destination_project_dataset_table=table_schema + '.tbl_' + table_name,
        create_disposition='CREATE_IF_NEEDED',
        source_format='csv',
        field_delimiter='|',
        autodetect=False,
        schema_object='json_schema/' + table_schema + '_' + table_name + '.json',
        source_objects=[
            'mergeload/data/' + sync_interval + '/' + ts + '/' + table_schema
            + '_' + table_name + '_' + pg_conn + '/*.csv'
        ],
        quote_character='"',
        allow_quoted_newlines=True,
        allow_jagged_rows=True,
        write_disposition='WRITE_APPEND',
        bigquery_conn_id='bigquery_default',
        google_cloud_storage_conn_id='google_cloud_storage_default',
        skip_leading_rows=1,
        dag=dag)
    bqload.execute(None)
def build_gcs_to_bq_task(dag, data_type):
    '''
    Creates a task to load a file from Google Cloud Storage into BigQuery.
    The name of the file being loaded is retrieved through Airflow's Xcom.
    Data types should be: 'accounts', 'ledgers', 'offers', 'operations',
    'trades', 'transactions', or 'trustlines'.

    Parameters:
        dag - parent dag that the task will be attached to
        data_type - type of the data being uploaded; should be string
    Returns:
        the newly created task
    '''
    bucket_name = Variable.get('gcs_exported_data_bucket_name')
    project_name = Variable.get('bq_project')
    dataset_name = Variable.get('bq_dataset')
    table_ids = Variable.get('table_ids', deserialize_json=True)
    prev_task_id = f'load_{data_type}_to_gcs'
    schema_fields = read_local_schema(data_type)

    return GoogleCloudStorageToBigQueryOperator(
        task_id=f'send_{data_type}_to_bq',
        google_cloud_storage_conn_id='google_cloud_platform_connection',
        bigquery_conn_id='google_cloud_platform_connection',
        bucket=bucket_name,
        schema_fields=schema_fields,
        autodetect=False,
        source_format='NEWLINE_DELIMITED_JSON',
        source_objects=[
            "{{ task_instance.xcom_pull(task_ids='" + prev_task_id + "') }}"
        ],
        destination_project_dataset_table=(
            f'{project_name}.{dataset_name}.{table_ids[data_type]}'
        ),
        write_disposition='WRITE_APPEND',
        create_disposition='CREATE_IF_NEEDED',
        dag=dag,
    )
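
# Usage sketch (illustrative, not from the original source): wiring the factory above into
# a DAG. The DAG name and schedule are assumptions; the factory only requires that the
# referenced Airflow Variables exist and that a task with id f'load_{data_type}_to_gcs'
# has pushed the exported GCS object path to XCom.
from airflow import DAG
from airflow.utils.dates import days_ago

with DAG('example_export_dag', start_date=days_ago(1), schedule_interval='@daily') as example_dag:
    # one load task per exported data type
    send_accounts_to_bq = build_gcs_to_bq_task(example_dag, 'accounts')
    send_ledgers_to_bq = build_gcs_to_bq_task(example_dag, 'ledgers')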
def gc_tasks(name, schema, next_task=DummyOperator(task_id="Done")):
    bq_staging = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_source }}}}.{name}"
    bq_warehouse = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_target }}}}.{name}"

    t1 = GoogleCloudStorageToBigQueryOperator(
        task_id=f"staging_{name}",
        bucket="{{var.value.gcs_bucket}}",
        source_objects=[f"{name}*"],
        destination_project_dataset_table=bq_staging,
        write_disposition="WRITE_TRUNCATE",
        schema_fields=schema,
        skip_leading_rows=1,
    )
    t2 = BigQueryOperator(
        task_id=f"merge_{name}_into_warehouse",
        sql=_create_merge_sql(bq_staging, bq_warehouse, schema),
        use_legacy_sql=False,
    )
    t3 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=f"move_{name}_to_processed",
        source_bucket="{{var.value.gcs_bucket}}",
        source_object=f"{name}*",
        destination_bucket="{{var.value.gcs_bucket}}",
        destination_object=f"processed/{name}",
        move_object=True,
    )

    t1 >> t2 >> t3 >> next_task
    return t1
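
# Usage sketch (assumed wiring, not part of the original module): calling the helper once per
# table inside a DAG context, since the operators above pick up the active DAG implicitly.
# Table name and schema are placeholders. Passing next_task explicitly also avoids reusing the
# DummyOperator created in the default argument.
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

with DAG("gcs_to_bq_warehouse", start_date=days_ago(1), schedule_interval="@daily") as warehouse_dag:
    orders_schema = [
        {"name": "order_id", "type": "STRING", "mode": "REQUIRED"},
        {"name": "amount", "type": "FLOAT", "mode": "NULLABLE"},
    ]
    done = DummyOperator(task_id="Done_orders")
    staging_head = gc_tasks("orders", orders_schema, next_task=done)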
def move_from_gcs_to_bq(**context):
    gcs_hook = GoogleCloudStorageHook()
    file_list = get_bucket_file_names(gcs_hook)
    destination_table = '{}.raw.raw'.format(env.BQ_PROJECT)
    for file in file_list:
        gcs_to_bq_operator = GoogleCloudStorageToBigQueryOperator(
            task_id='temp',
            bigquery_conn_id='google_cloud_default',
            bucket=env.GCS_BUCKET,
            source_objects=[file],
            destination_project_dataset_table=destination_table,
            write_disposition='WRITE_APPEND',
            schema_fields=raw_schema,
            skip_leading_rows=1)
        gcs_to_bq_operator.execute(context)
        gcs_hook.delete(bucket=env.GCS_BUCKET, object=file)
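
# Usage sketch (assumption): since the callable above accepts **context and forwards it to
# operator.execute(), it is presumably run from a PythonOperator. Task id and the `dag`
# object are illustrative.
from airflow.operators.python_operator import PythonOperator

load_raw_files = PythonOperator(
    task_id='move_from_gcs_to_bq',
    python_callable=move_from_gcs_to_bq,
    provide_context=True,  # Airflow 1.x; in Airflow 2.x the context is injected automatically
    dag=dag)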
def test_execute_explicit_project(self, bq_hook):
    operator = GoogleCloudStorageToBigQueryOperator(
        task_id=TASK_ID,
        bucket=TEST_BUCKET,
        source_objects=TEST_SOURCE_OBJECTS,
        destination_project_dataset_table=TEST_EXPLICIT_DEST,
        max_id_key=MAX_ID_KEY)

    # using non-legacy SQL
    bq_hook.return_value.get_conn.return_value.cursor.return_value.use_legacy_sql = False

    operator.execute(None)

    bq_hook.return_value \
        .get_conn.return_value \
        .cursor.return_value \
        .execute \
        .assert_called_once_with("SELECT MAX(id) FROM `test-project.dataset.table`")
def gen_import_table_task(table_config):
    import_task = GoogleCloudStorageToBigQueryOperator(
        task_id='{}_to_bigquery'.format(table_config.params['export_table']),
        bucket=table_config.params['export_bucket'],
        source_objects=["cloudsql_to_bigquery/{}/{}*".format(
            table_config.params['export_database'],
            table_config.params['export_table'])],
        destination_project_dataset_table="{}.{}.{}".format(
            table_config.params['gcp_project'],
            table_config.params['stage_dataset'],
            table_config.params['stage_table']),
        schema_object="cloudsql_to_bigquery/schema/{}/schema_raw".format(
            table_config.params['export_table']),
        write_disposition='WRITE_TRUNCATE',
        source_format="NEWLINE_DELIMITED_JSON",
        dag=dag)

    import_task.doc_md = """\
#### Import table from storage to bigquery task documentation
"""
    return import_task
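
# Usage sketch (hypothetical table_config): the function above only reads table_config.params,
# so any object exposing a `params` dict works. All names below are placeholders, and the
# module-level `dag` referenced inside the function is assumed to exist.
from types import SimpleNamespace

orders_config = SimpleNamespace(params={
    'export_table': 'orders',
    'export_bucket': 'my-cloudsql-export-bucket',
    'export_database': 'shop',
    'gcp_project': 'my-gcp-project',
    'stage_dataset': 'staging',
    'stage_table': 'orders_raw',
})
orders_to_bq = gen_import_table_task(orders_config)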
def createTaskHelper(table):
    return GoogleCloudStorageToBigQueryOperator(
        task_id='create_staging_{0}'.format(table),
        field_delimiter='|',
        schema_object='schema/{0}.json'.format(table),
        source_objects=['{0}/{1}.dat'.format('dimension', table.lower())],
        bucket='da304-staging',
        destination_project_dataset_table="{0}.{1}".format(BQ_STAGING_DATASET_NAME, table),
        external_table=True,
        dag=dag_daily)
def createTaskHelper(table):
    return GoogleCloudStorageToBigQueryOperator(
        task_id='create_staging_{0}'.format(table),
        skip_leading_rows=1,
        field_delimiter='|',
        schema_object='schema/{0}.json'.format(table),
        source_objects=['{0}/{1}/{2}.csv'.format('facts', job_run_date, table)],
        bucket='da304-staging',
        destination_project_dataset_table="{0}.{1}".format(BQ_STAGING_DATASET_NAME, table),
        external_table=True,
        dag=dag_daily)
def Import_From_File(proj, dset, tab, bucket, path, infile):
    return GoogleCloudStorageToBigQueryOperator(
        task_id='import_to_' + tab,
        bucket=bucket,
        source_objects=[path + '/' + infile],
        field_delimiter=',',
        skip_leading_rows=1,
        allow_quoted_newlines=True,
        destination_project_dataset_table=proj + '.' + dset + '.' + tab,
        create_disposition='CREATE_NEVER',
        write_disposition='WRITE_TRUNCATE',
        schema_object='schema/' + dset + '/' + tab + '.json',
        autodetect=False,
        trigger_rule='none_failed')
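
# Usage sketch (placeholder project, dataset, bucket, and path): one load task per CSV file.
# The helper does not pass a dag, so it is assumed to be called inside an active DAG context.
# CREATE_NEVER above means the destination table must already exist.
load_customers = Import_From_File(
    proj='my-project',
    dset='staging',
    tab='customers',
    bucket='my-landing-bucket',
    path='exports/2021-01-01',
    infile='customers.csv')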
def construct_gcs_to_bq_operator(
        task_id: str,
        source_objects: List[str],
        schema_fields: List[Dict],
        destination_project_dataset_table: str
) -> GoogleCloudStorageToBigQueryOperator:
    return GoogleCloudStorageToBigQueryOperator(
        task_id=task_id,
        bucket=GCS_BUCKET,
        source_objects=source_objects,
        schema_fields=schema_fields,
        field_delimiter=get_delimiter(source_objects),
        destination_project_dataset_table=destination_project_dataset_table,
        write_disposition='WRITE_TRUNCATE',
        autodetect=False,
        bigquery_conn_id=BIG_QUERY_CONN_ID,
        google_cloud_storage_conn_id=GOOGLE_CLOUD_DEFAULT,
        ignore_unknown_values=False)
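
# Usage sketch (illustrative arguments): the factory above centralizes the bucket, delimiter
# detection, and connection ids, so call sites only describe the objects and their schema.
# GCS_BUCKET, BIG_QUERY_CONN_ID, GOOGLE_CLOUD_DEFAULT, and get_delimiter are assumed to be
# defined at module level as in the original source.
load_events = construct_gcs_to_bq_operator(
    task_id='load_events',
    source_objects=['events/2021-01-01/*.csv'],  # placeholder object path
    schema_fields=[
        {'name': 'event_id', 'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'created_at', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
    ],
    destination_project_dataset_table='my-project.analytics.events')  # placeholder table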
"type": "STRING", "name": "malepopulation", "mode": "NULLABLE", }, { "type": "STRING", "name": "totalpopulation", "mode": "NULLABLE", }, { "type": "String", "name": "year", "mode": "NULLABLE", }] from_gcs_to_bigQuery = GoogleCloudStorageToBigQueryOperator( task_id='import_data', bucket='us-central1-airflows-0e76080f-bucket/data', source_objects=['*.csv'], destination_project_dataset_table='yueairflow.Jason_Yue.population', source_format='CSV', field_delimiter=',', max_bad_records=10, skip_leading_rows=1, schema_fields=schema, create_disposition='CREATE_IF_NEEDED', write_disposition='WRITE_TRUNCATE', dag=dag, bigquery_conn_id='google_cloud_default', trigger_rule='all_success') read_json_2013 >> write_csv_gcs >> from_gcs_to_bigQuery
    bash_command="bash /home/airflow/gcs/plugins/sqoop-incremental-imports.sh ephemeral-spark-cluster-{{ds_nodash}}",
    dag=dag)

hdfs_to_gcs = BashOperator(
    task_id="hdfs_to_gcs",
    bash_command="gcloud compute ssh ephemeral-spark-cluster-{{ds_nodash}}-m --zone='asia-southeast2-a' -- -T 'hadoop distcp /incremental_buckets/*.avro gs://bigdata-etl-2_flights/sqoop_output/'",
    dag=dag)

bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator(
    task_id="bq_load_flight_delays",
    bucket="bigdata-etl-2_flights",
    source_objects=["sqoop_output/part.20190515_*.avro"],
    destination_project_dataset_table=PROJECT_ID + ".data_flights.flights_delays",
    autodetect=True,
    source_format="AVRO",
    create_disposition="CREATE_IF_NEEDED",
    skip_leading_rows=0,
    write_disposition="WRITE_APPEND",
    max_bad_records=0)

# delete_cluster = DataprocClusterDeleteOperator(
#     task_id='delete_dataproc_cluster',
#     cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
#     region='asia-east1',
#     trigger_rule=TriggerRule.ALL_DONE
# )

create_cluster.dag = dag
tasks = Variable.get('moat_tasks', deserialize_json=True)
logging.info("Loaded Tasks")

for tile, campaigns in tasks.items():
    tile = int(tile)  # TODO: tile type on variable load
    start_task = DummyOperator(task_id='Start_{}'.format(tile), retries=0, dag=dag)
    store_task = GoogleCloudStorageToBigQueryOperator(
        task_id="Store_{}".format(tile),
        bucket='rtf_staging',
        source_objects=['{}/*'.format(tile)],
        destination_project_dataset_table='essence-analytics-dwh:RTF_DWH_Moat.{}_'.format(tile) + '{{ ds_nodash }}',
        schema_fields=moat_schemas.get(tile),
        source_format='NEWLINE_DELIMITED_JSON',
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        autodetect=False,
        dag=dag)

    for campaign in campaigns:
        level_filter, dimensions = campaign  # unpack tuple
        filter_value = [*level_filter.values()][0]
        request_task = Moat_To_GCS(
            task_id="Req_{}_{}".format(tile, filter_value),
            brand_id=tile,
        'additional-cloud-sql-instances': INSTANCE_NAME
    },
    image_version='1.5')

submit_sqoop = BashOperator(
    task_id="sqoop_full_table_import",
    bash_command='bash /home/airflow/gcs/plugins/sqoop_simple_table_imports_for_airflow.sh ephemeral-spark-cluster-{{ds_nodash}}'
)

bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator(
    task_id="bq_load_flight_delays",
    bucket="spark-etl-1",
    source_objects=["sqoop-output/flights/*.avro"],
    destination_project_dataset_table="bigdata-etl-20201027.data_analysis.flight_delays",
    autodetect=True,
    source_format="AVRO",
    create_disposition="CREATE_IF_NEEDED",
    skip_leading_rows=0,
    write_disposition="WRITE_APPEND",
    max_bad_records=0)

delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc_cluster",
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    region="us-central1",
    trigger_rule=TriggerRule.ALL_DONE)

create_cluster.dag = dag
create_cluster.set_downstream(submit_sqoop)
submit_sqoop.set_downstream(bq_load_flight_delays)
def __new__(
    cls,
    parent_id,
    gcs_dirs_xcom,
    dst_dir,
    filename,
    schema_fields,
    table_name,
    task_id,
    dag,
):
    from airflow.utils.dates import days_ago

    args = {
        "start_date": days_ago(2),
    }
    bucket = get_bucket().replace("gs://", "", 1)
    full_table_name = format_table_name(table_name, is_staging=True)
    subdag = DAG(dag_id=f"{parent_id}.{task_id}", default_args=args)
    column_names = [schema["name"] for schema in schema_fields]

    # by convention, preface task names with dag_id
    op_col_select = PythonTaskflowOperator(
        task_id="select_cols",
        python_callable=_keep_columns,
        # note that this input should have form schedule/{execution_date}/...
        taskflow={
            "gcs_dirs": {
                "dag_id": parent_id,
                "task_ids": gcs_dirs_xcom
            }
        },
        op_kwargs={
            "dst_dir": dst_dir,
            "filename": filename,
            "required_cols": [],
            "optional_cols": column_names,
        },
        dag=subdag,
    )

    op_stage_bq = GoogleCloudStorageToBigQueryOperator(
        task_id="stage_bigquery",
        bucket=bucket,
        # note that we can't really pull a list out of xcom without subclassing
        # operators, so we rely on knowing that the task passing in
        # gcs_dirs_xcom data is using schedule/{execution_date}
        source_objects=[
            "schedule/{{execution_date}}/*/%s/%s" % (dst_dir, filename)
        ],
        schema_fields=schema_fields,
        destination_project_dataset_table=full_table_name,
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
        # _keep_columns function includes headers in output
        skip_leading_rows=1,
        dag=subdag,
    )

    op_col_select >> op_stage_bq

    return SubDagOperator(subdag=subdag, dag=dag, task_id=task_id)
    arguments=[
        "--gcs_path_raw={{ dag_run.conf['raw_path'] }}",
        "--gcs_path_transformed=gs://{{ var.value.gcs_bucket}}" +
        "/{{ dag_run.conf['transformed_path'] }}"
    ])

# Load the transformed files to a BigQuery table.
bq_load = GoogleCloudStorageToBigQueryOperator(
    task_id='GCS_to_BigQuery',
    bucket='{{ var.value.gcs_bucket }}',
    # Wildcard for objects created by spark job to be written to BigQuery.
    # Reads the relative path to the objects transformed by the spark job
    # from the POST message.
    source_objects=["{{ dag_run.conf['transformed_path'] }}/part-*"],
    destination_project_dataset_table='{{ var.value.bq_output_table }}',
    schema_fields=None,
    # Relative gcs path to schema file.
    schema_object='schemas/nyc-tlc-yellow.json',
    # Note that our spark job does json -> csv conversion.
    source_format='CSV',
    create_disposition='CREATE_IF_NEEDED',
    skip_leading_rows=0,
    write_disposition='WRITE_TRUNCATE',  # If the table exists, overwrite it.
    max_bad_records=0)

# Delete the Cloud Dataproc cluster.
delete_cluster = DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    # Obviously needs to match the name of cluster created in the prior two
    # Operators.
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        catchup=False,
        default_args=default_arguments,
        user_defined_macros={'project': PROJECT_ID}) as dag:

    list_files = PythonOperator(
        task_id='list_files',
        python_callable=list_objects,
        op_kwargs={'bucket': LANDING_BUCKET},
    )

    load_data = GoogleCloudStorageToBigQueryOperator(
        task_id='load_data',
        bucket=LANDING_BUCKET,
        source_objects=['*'],
        source_format='CSV',
        skip_leading_rows=1,
        destination_project_dataset_table='{{ project }}.analytics.history',
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_APPEND',
        bigquery_conn_id='google_cloud_default',
        google_cloud_storage_conn_id='google_cloud_default')

    query = '''
    SELECT * EXCEPT (rank)
    FROM (
        SELECT
            *,
            ROW_NUMBER() OVER (
                PARTITION BY vehicle_id
                ORDER BY DATETIME(date, TIME(hour, minute, 0))
            ) AS rank
        FROM {{ project }}.analytics.history) AS latest
    WHERE rank = 1;
    [t4_US, t4_CA, ...] >> t5_gamma >> t6_dummy
'''
try:
    t5_prime_tableCheck = BigQueryCheckOperator(
        task_id='checkForTable',
        sql="SELECT COUNT(*) FROM `{}.{}.{}`".format(project_id, dataset_id, table_id),
        bigquery_conn_id=bigquery_conn_id,
        use_legacy_sql=False,
        dag=dag)
    t5_prime_tableCheck.set_upstream(writeToGCS_task)

    storageToBQ_task = GoogleCloudStorageToBigQueryOperator(
        task_id='uploadtoBQ_{}'.format(datetime.now().strftime('%Y%m%d_%H%M')),
        bucket=bucket_name,
        source_objects=tempGCS_dir_paths,
        destination_project_dataset_table="{}.{}.{}".format(project_id, dataset_id, table_id),
        schema_fields=schema,
        write_disposition='WRITE_TRUNCATE',
        dag=dag)
    storageToBQ_task.set_upstream(t5_prime_tableCheck)
    dummy_task.set_upstream(storageToBQ_task)
except Exception as e:
    print("BigQueryCheck error = {}".format(e))
    t5_gamma_tableCreate = BigQueryCreateExternalTableOperator(
        task_id='CreateBQtable',
        bucket=bucket_name,
        source_objects=tempGCS_dir_paths,
        destination_project_dataset_table="{}.{}.{}".format(project_id, dataset_id, table_id),
        schema_fields=schema,
        dag=dag)
    dummy_task.set_upstream(t5_gamma_tableCreate)
)

comp_aggregate = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://europe-west1-training-airfl-159310f1-bucket/other/build_statistics_simple.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=["{{ ds }}"],
    dag=dag,
)

del_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-20165e4959a78c1d',
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)

TABLE_THANG = 'airflowbolcom-20165e4959a78c1d:yolo.piet${{ ds_nodash }}'

into_BQ = GoogleCloudStorageToBigQueryOperator(
    task_id="stuff_to_BQ",
    bucket='airflow_training_bucket',
    source_objects=['average_prices/transfer_date={{ ds }}/*.parquet'],
    destination_project_dataset_table=TABLE_THANG,
    source_format='PARQUET',
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

get_data >> create_cluster >> comp_aggregate >> del_cluster >> into_BQ
# =======================
# == load to bigquery ===
# =======================
bq_load_user = GoogleCloudStorageToBigQueryOperator(
    task_id='bq_load_user',
    bucket='exampleBucket',
    source_objects=["obj_folder/*"],
    source_format='NEWLINE_DELIMITED_JSON',
    destination_project_dataset_table='exampleSchema.exampleTable0',
    schema_fields=[
        {
            'name': 'user_id',
            'type': 'STRING',
            'mode': 'NULLABLE'
        },
        {
            'name': 'first_timestamp',
            'type': 'STRING',
            'mode': 'NULLABLE'
        },
    ],
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

bq_load_agg = GoogleCloudStorageToBigQueryOperator(
    task_id='bq_load_agg',
    bucket='exampleBucket',
    source_objects=["obj_folder/*"],
)

submit_pyspark = DataProcPySparkOperator(
    task_id="run_pyspark_etl",
    main=PYSPARK_JOB,
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    region="asia-southeast2"
)

bq_load_series_1 = GoogleCloudStorageToBigQueryOperator(
    task_id="bq_load_series_1",
    bucket='bigdata-etl-2_flights',
    source_objects=["series_data_output/" + current_date + "_datamart_1/*.json"],
    destination_project_dataset_table=f'{PROJECT_ID}:qoala_test.series_datamart_1',
    autodetect=True,
    source_format="NEWLINE_DELIMITED_JSON",
    create_disposition="CREATE_IF_NEEDED",
    skip_leading_rows=0,
    write_disposition="WRITE_APPEND",
    max_bad_records=0,
)

bq_load_series_2 = GoogleCloudStorageToBigQueryOperator(
    task_id="bq_load_series_2",
    bucket='bigdata-etl-2_flights',
    source_objects=["series_data_output/" + current_date + "_datamart_2/*.json"],
    destination_project_dataset_table=f'{PROJECT_ID}:qoala_test.series_datamart_2',
    autodetect=True,
    source_format="NEWLINE_DELIMITED_JSON",
synthGCStoBQSync = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket='synth-info',
    source_objects=['synthinfo.csv'],
    schema_fields=[
        {"mode": "NULLABLE", "name": "address", "type": "STRING"},
        {"mode": "NULLABLE", "name": "collateralToken", "type": "STRING"},
        {"mode": "NULLABLE", "name": "syntheticToken", "type": "STRING"},
        {"mode": "NULLABLE", "name": "totalPositionCollateral", "type": "FLOAT"},
        {"mode": "NULLABLE", "name": "totalTokensOutstanding", "type": "FLOAT"},
        {"mode": "NULLABLE", "name": "timestamp", "type": "TIMESTAMP"},
        {"mode": "NULLABLE", "name": "collateralTokenName", "type": "STRING"},
        {"mode": "NULLABLE", "name": "syntheticTokenName", "type": "STRING"},
    ],
    destination_project_dataset_table='uma.SynthInfo',
    skip_leading_rows=1,
    write_disposition="WRITE_APPEND",
    dag=dag)
        '/firearms_dataflow.py'),
    dag=dag)

beam_cleanup = BashOperator(
    task_id='firearms_beam_cleanup',
    bash_command=airflow_utils.beam_cleanup_statement(
        '{}_firearm_seizures'.format(os.environ['GCS_PREFIX'])),
    dag=dag)

bq_insert_temp = GoogleCloudStorageToBigQueryOperator(
    task_id='firearms_bq_insert',
    destination_project_dataset_table='{}:firearm_seizures.seizures_temp'.format(
        os.environ['GCP_PROJECT']),
    bucket='{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
    source_objects=[
        "avro_output/{}/{}/{}/*.avro".format(dt.strftime('%Y'),
                                             dt.strftime('%m').lower(),
                                             dt.strftime("%Y-%m-%d"))
    ],
    write_disposition='WRITE_APPEND',
    source_format='AVRO',
    time_partitioning={'type': 'DAY'},
    dag=dag)

bq_geo_join = BigQueryOperator(
    task_id='firearms_bq_geojoin',
    sql=build_revgeo_query('firearm_seizures', 'seizures_temp'),
    use_legacy_sql=False,
    destination_dataset_table='{}:firearm_seizures.seizures'.format(
        os.environ['GCP_PROJECT']),
    write_disposition='WRITE_APPEND',
    time_partitioning={'type': 'DAY'},
task_1 = GoogleCloudStorageToBigQueryOperator(
    task_id='load_price_quote_to_bigquery',
    bucket='bv-data',
    source_objects=['data/price_quote.csv'],
    destination_project_dataset_table='{}.{}.price_quotes'.format(project, dsIngestion),
    write_disposition='WRITE_TRUNCATE',
    skip_leading_rows=1,
    autodetect=False,
    schema_fields=[
        {"name": "tube_assembly_id", "type": "STRING", "mode": "NULLABLE"},
        {"name": "supplier", "type": "STRING", "mode": "NULLABLE"},
        {"name": "quote_date", "type": "DATE", "mode": "NULLABLE"},
        {"name": "annual_usage", "type": "INTEGER", "mode": "NULLABLE"},
        {"name": "min_order_quantity", "type": "INTEGER", "mode": "NULLABLE"},
        {"name": "bracket_pricing", "type": "BOOLEAN", "mode": "NULLABLE"},
        {"name": "quantity", "type": "INTEGER", "mode": "NULLABLE"},
        {"name": "cost", "type": "FLOAT", "mode": "NULLABLE"},
    ],
    dag=dag)
dag = DAG(dag_nombre,
          default_args=default_dag_args,
          schedule_interval=schedule)

start = DummyOperator(task_id='start', dag=dag)

# Read the file names (in this ad hoc case there are 3), iterate over them,
# and load each one into BigQuery.
for i, table_name in enumerate(params_bq_schema):
    gcs_to_bq = GoogleCloudStorageToBigQueryOperator(
        task_id='gcs_to_bq-{}'.format(table_name),
        bucket=params['BUCKET_ID'],
        allow_quoted_newlines=True,  # Allows reading double-quoted ("") values
        skip_leading_rows=1,  # Skip the header row
        source_objects=['data/{}.csv'.format(table_name)],
        destination_project_dataset_table='{}:{}.{}'.format(
            params['GCP_PROJECT_ID'], params['BQ_DATASET_ID'], table_name),
        schema_fields=params_bq_schema[table_name],
        write_disposition='WRITE_TRUNCATE',
        dag=dag
    )
    list_gcs_to_bq.append(gcs_to_bq)

# Read the query from the Airflow Variables to build the final aggregated
# dataset and store it in BigQuery.
execute_bq_sql = BigQueryOperator(
    task_id='execute_bq_sql',
    sql=query_sql,
    use_legacy_sql=False,
    destination_dataset_table=bq_recent_questions_table_id,
"input": "gs://{bucket}/airflow-training-data/land_registry_price_paid_uk/{{{{ ds }}}}/" .format(bucket=bucket_name), "table": "first_result_table", "dataset": "airflow_dataset", }, dag=dag, ) results_to_bigquery = GoogleCloudStorageToBigQueryOperator( task_id='results_to_bigquery', bucket=bucket_name, source_objects=['airflow-training-data/results/{{ ds }}/part-*'], destination_project_dataset_table='airflow_dataset.results_{{ ds_nodash }}', source_format='PARQUET', # schema_fields=schemas.gsob(), create_disposition='CREATE_IF_NEEDED', write_disposition='WRITE_TRUNCATE', google_cloud_storage_conn_id='google_cloud_storage_default', bigquery_conn_id='bigquery_default', autodetect=True, dag=dag, ) [pgsl_to_gcs, http_to_gcs ] >> dataproc_create_cluster >> compute_aggregates >> [ dataproc_delete_cluster, load_into_bigquery, results_to_bigquery ]
t2 = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket='example-datasets',
    source_objects=['game_data_condensed.csv'],
    destination_project_dataset_table='{0}.gcp_example_table'.format(BQ_DATASET_NAME),
    schema_fields=[
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'team', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'total_score', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'timestamp', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'window_start', 'type': 'STRING', 'mode': 'NULLABLE'},
    ],
    write_disposition='WRITE_TRUNCATE')
    region="asia-east1",
    zone="asia-east1-a")

submit_pyspark = DataProcPySparkOperator(
    task_id="run_pyspark_etl",
    main=PYSPARK_JOB,
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    region="asia-east1")

bq_load_delays_by_distance = GoogleCloudStorageToBigQueryOperator(
    task_id="bq_load_avg_delays_by_distance",
    bucket='enr1qu319-data-engineer-1',
    source_objects=[
        "flights_data_output/" + current_date + "_distance_category/part-*"
    ],
    destination_project_dataset_table=PROJECT_ID + ".data_analysis.avg_delays_by_distance",
    autodetect=True,
    source_format="NEWLINE_DELIMITED_JSON",
    create_disposition="CREATE_IF_NEEDED",
    skip_leading_rows=0,
    write_disposition="WRITE_APPEND",
    max_bad_records=0)

bq_load_delays_by_flight_nums = GoogleCloudStorageToBigQueryOperator(
    task_id="bq_load_delays_by_flight_nums",
    bucket='enr1qu319-data-engineer-1',
    source_objects=[
        "flights_data_output/" + current_date + "_flight_nums/part-*"
    ],
    destination_project_dataset_table=PROJECT_ID + ".data_analysis.avg_delays_by_flight_nums",
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# Query-only kwargs copied from BigQueryOperator (udf_config, use_legacy_sql,
# maximum_billing_tier, maximum_bytes_billed, query_params, labels, priority,
# api_resource_configs) were dropped: the load operator does not accept them.
load_parquet_bqt = GoogleCloudStorageToBigQueryOperator(
    task_id='load_parquet_bqt',
    bucket='dphivedb',
    source_objects=['HQL/PARQUET/*'],
    schema_fields=None,
    schema_object=None,
    source_format='parquet',
    destination_project_dataset_table='bqdataset.test3',
    bigquery_conn_id='bigquery_default',
    google_cloud_storage_conn_id='google_cloud_default',
    write_disposition='WRITE_TRUNCATE',
    autodetect=True,
    delegate_to=None,
    create_disposition='CREATE_IF_NEEDED',
    schema_update_options=(),
    time_partitioning=None,
    cluster_fields=None,
    location=None,
    encryption_configuration=None,
    dag=dag)

rm_par_file = BashOperator(
    ],
)

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    project_id=PROJECT_ID,
    task_id="delete_dataproc_cluster",
    cluster_name="gcp-data-platform",
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
)

gcs_to_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id="gcs_to_bigquery",
    bucket=OUTPUT_BUCKET,
    source_objects=['{{ ds_format(ds, "%Y/%m/%d") }}/*.parquet'],
    # braces are doubled twice so that .format() leaves the Jinja placeholder intact
    destination_project_dataset_table="{project_id}.events.events{{{{ ds_nodash }}}}".format(
        project_id=PROJECT_ID),
    source_format="PARQUET",
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
)

explore_visits_by_hour = BigQueryOperator(
    task_id="explore_visits_by_hour",
    sql="""
        SELECT
            FORMAT_DATETIME("%F %H:00:00",
                            DATETIME(TIMESTAMP_SECONDS(CAST(timestamp AS INT64)))) AS ts,
            COUNT(1) AS num_visits
        FROM events.events
        WHERE url = '/explore'
        GROUP BY ts
        ORDER BY ts ASC