def test_execute(self, mock_hook):
    """The operator must forward every argument (plus documented defaults)
    straight through to ``BigQueryHook.create_external_table``."""
    table = '{}.{}'.format(TEST_DATASET, TEST_TABLE_ID)
    operator = BigQueryCreateExternalTableOperator(
        task_id=TASK_ID,
        destination_project_dataset_table=table,
        schema_fields=[],
        bucket=TEST_GCS_BUCKET,
        source_objects=TEST_GCS_DATA,
        source_format=TEST_SOURCE_FORMAT,
    )

    operator.execute(None)

    # Each GCS object is expected to be expanded into a full gs:// URI.
    expected_uris = [
        'gs://{}/{}'.format(TEST_GCS_BUCKET, obj) for obj in TEST_GCS_DATA
    ]
    mock_hook.return_value.create_external_table.assert_called_once_with(
        external_project_dataset_table=table,
        schema_fields=[],
        source_uris=expected_uris,
        source_format=TEST_SOURCE_FORMAT,
        compression='NONE',
        skip_leading_rows=0,
        field_delimiter=',',
        max_bad_records=0,
        quote_character=None,
        allow_quoted_newlines=False,
        allow_jagged_rows=False,
        src_fmt_configs={},
        labels=None,
        encryption_configuration=None,
    )
# Expose a Firestore/Datastore export stored in GCS as the BigQuery external
# table <DATASET_NAME>.firestore_data, described via an API-style payload.
# NOTE(review): `source_objects` is passed alongside `table_resource`; some
# provider versions treat the keyword arguments as mutually exclusive with
# `table_resource` — confirm against the installed google provider.
create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
    task_id="create_external_table",
    bucket=BUCKET_NAME,
    table_resource={
        "tableReference": {
            "projectId": GCP_PROJECT_ID,
            "datasetId": DATASET_NAME,
            "tableId": "firestore_data",
        },
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING"},
                {"name": "post_abbr", "type": "STRING"},
            ]
        },
        "externalDataConfiguration": {
            "sourceFormat": "DATASTORE_BACKUP",
            "compression": "NONE",
            "csvOptions": {"skipLeadingRows": 1},
        },
    },
    source_objects=[
        f"{EXPORT_PREFIX}/all_namespaces/kind_{EXPORT_COLLECTION_ID}"
        f"/all_namespaces_kind_{EXPORT_COLLECTION_ID}.export_metadata"
    ],
)
# Trailing keyword arguments of an operator call that opens above this chunk.
location=DATASET_LOCATION,
project_id=GCP_PROJECT_ID,
)

# Tear the dataset down at the end of the example run; delete_contents=True
# removes any tables first so the dataset deletion cannot fail.
delete_dataset = BigQueryDeleteDatasetOperator(
    task_id="delete_dataset",
    dataset_id=DATASET_NAME,
    project_id=GCP_PROJECT_ID,
    delete_contents=True)

# [START howto_operator_create_external_table_multiple_types]
# External BigQuery table backed by a Firestore/Datastore export in GCS.
create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
    task_id="create_external_table",
    bucket=BUCKET_NAME,
    source_objects=[
        f"{EXPORT_PREFIX}/all_namespaces/kind_{EXPORT_COLLECTION_ID}"
        f"/all_namespaces_kind_{EXPORT_COLLECTION_ID}.export_metadata"
    ],
    source_format="DATASTORE_BACKUP",
    destination_project_dataset_table=f"{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data",
)
# [END howto_operator_create_external_table_multiple_types]

# Sanity-check query proving the external table is readable.
read_data_from_gcs_multiple_types = BigQueryExecuteQueryOperator(
    task_id="execute_query",
    # Standard SQL is required for the backtick-quoted table name below.
    sql=f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data`",
    use_legacy_sql=False,
)

# Firestore
# [START howto_operator_create_external_table_multiple_types]
# Declare the external table entirely through the API-style `table_resource`
# payload instead of the operator's individual keyword arguments.
# NOTE(review): `externalDataConfiguration` carries no `sourceUris` here —
# presumably they are derived from `bucket` by the operator; confirm against
# the installed google provider version.
create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
    task_id="create_external_table",
    bucket=BUCKET_NAME,
    table_resource={
        "tableReference": {
            "projectId": GCP_PROJECT_ID,
            "datasetId": DATASET_NAME,
            "tableId": "firestore_data",
        },
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING"},
                {"name": "post_abbr", "type": "STRING"},
            ]
        },
        "externalDataConfiguration": {
            "sourceFormat": "DATASTORE_BACKUP",
            "compression": "NONE",
            "csvOptions": {"skipLeadingRows": 1},
        },
    },
)
# [END howto_operator_create_external_table_multiple_types]
# Trailing keyword arguments of an operator call that opens above this chunk.
max_results="10",
selected_fields="value,to_address",
)

# Echo the XCom value pushed by the "get-data" task.
get_data_result = BashOperator(
    task_id="get_data_result",
    bash_command="echo \"{{ task_instance.xcom_pull('get-data') }}\"")

# External table over a CSV sample in GCS.
create_external_table = BigQueryCreateExternalTableOperator(
    task_id="create_external_table",
    bucket=DATA_SAMPLE_GCS_BUCKET_NAME,
    source_objects=[DATA_SAMPLE_GCS_OBJECT_NAME],
    destination_project_dataset_table="{}.external_table".format(DATASET_NAME),
    skip_leading_rows=1,  # skip the CSV header row
    schema_fields=[{
        "name": "name",
        "type": "STRING"
    }, {
        "name": "post_abbr",
        "type": "STRING"
    }],
)

# Query the external table into a native destination table.
# NOTE: this operator call continues past the end of the visible chunk.
execute_query_external_table = BigQueryExecuteQueryOperator(
    task_id="execute_query_external_table",
    destination_dataset_table="{}.selected_data_from_external_table".format(DATASET_NAME),
    sql='SELECT * FROM `{}.external_table` WHERE name LIKE "W%"'.format(DATASET_NAME),
    use_legacy_sql=False,
# Closing parenthesis of the operator defined above this chunk.
)
# [END howto_operator_presto_to_gcs_multiple_types]

# [START howto_operator_create_external_table_multiple_types]
# External table over the JSON files exported by the Presto-to-GCS task.
# NOTE(review): `source_objects` / `schema_object` are passed alongside
# `table_resource`; some provider versions treat these keyword arguments as
# mutually exclusive with `table_resource` — confirm against the installed
# google provider.
create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
    task_id="create_external_table_multiple_types",
    bucket=GCS_BUCKET,
    source_objects=[f"{safe_name(SOURCE_MULTIPLE_TYPES)}.*.json"],
    table_resource={
        "tableReference": {
            "projectId": GCP_PROJECT_ID,
            "datasetId": DATASET_NAME,
            "tableId": f"{safe_name(SOURCE_MULTIPLE_TYPES)}",
        },
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING"},
                {"name": "post_abbr", "type": "STRING"},
            ]
        },
        "externalDataConfiguration": {
            "sourceFormat": "NEWLINE_DELIMITED_JSON",
            "compression": "NONE",
            "csvOptions": {"skipLeadingRows": 1},
        },
    },
    schema_object=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json",
)
# [END howto_operator_create_external_table_multiple_types]

# Read the data back via a query job.
# NOTE: this operator call continues past the end of the visible chunk.
read_data_from_gcs_multiple_types = BigQueryInsertJobOperator(
    task_id="read_data_from_gcs_multiple_types",
# [START howto_operator_bigquery_create_external_table]
# Create a BigQuery external table over the sample CSV in GCS, described
# entirely via the API-style `table_resource` payload (including the
# source URIs, so no separate bucket/source_objects arguments are needed).
create_external_table = BigQueryCreateExternalTableOperator(
    task_id="create_external_table",
    table_resource={
        "tableReference": {
            "projectId": PROJECT_ID,
            "datasetId": DATASET_NAME,
            "tableId": "external_table",
        },
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING"},
                {"name": "post_abbr", "type": "STRING"},
            ]
        },
        "externalDataConfiguration": {
            "sourceFormat": "CSV",
            "compression": "NONE",
            "csvOptions": {"skipLeadingRows": 1},
            "sourceUris": [DATA_SAMPLE_GCS_URL],
        },
    },
)
# [END howto_operator_bigquery_create_external_table]
presto_to_gcs_multiple_types = PrestoToGCSOperator( task_id="presto_to_gcs_multiple_types", sql=f"select * from {SOURCE_MULTIPLE_TYPES}", bucket=GCS_BUCKET, filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}.{{}}.json", schema_filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json", gzip=False, ) # [END howto_operator_presto_to_gcs_multiple_types] # [START howto_operator_create_external_table_multiple_types] create_external_table_multiple_types = BigQueryCreateExternalTableOperator( task_id="create_external_table_multiple_types", bucket=GCS_BUCKET, source_objects=[f"{safe_name(SOURCE_MULTIPLE_TYPES)}.*.json"], source_format="NEWLINE_DELIMITED_JSON", destination_project_dataset_table= f"{DATASET_NAME}.{safe_name(SOURCE_MULTIPLE_TYPES)}", schema_object=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json", ) # [END howto_operator_create_external_table_multiple_types] read_data_from_gcs_multiple_types = BigQueryExecuteQueryOperator( task_id="read_data_from_gcs_multiple_types", sql= f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{safe_name(SOURCE_MULTIPLE_TYPES)}`", use_legacy_sql=False, ) # [START howto_operator_presto_to_gcs_many_chunks] presto_to_gcs_many_chunks = PrestoToGCSOperator(
# Closing braces of an operator call that opens above this chunk.
},
)

# TODO: Homework - research and try XCOM to communicate output values between 2 tasks/operators
# Upload the locally generated parquet file into the bucket's raw/ prefix.
local_to_gcs_task = PythonOperator(
    task_id="local_to_gcs_task",
    python_callable=upload_to_gcs,
    op_kwargs={
        "bucket": BUCKET,
        "object_name": f"raw/{parquet_file}",
        "local_file": f"{path_to_local_home}/{parquet_file}",
    },
)

# External BigQuery table pointing at the parquet file just uploaded.
# No "schema" block is supplied — presumably the schema comes from the
# parquet metadata; TODO confirm.
bigquery_external_table_task = BigQueryCreateExternalTableOperator(
    task_id="bigquery_external_table_task",
    table_resource={
        "tableReference": {
            "projectId": PROJECT_ID,
            "datasetId": BIGQUERY_DATASET,
            "tableId": "external_table",
        },
        "externalDataConfiguration": {
            "sourceFormat": "PARQUET",
            "sourceUris": [f"gs://{BUCKET}/raw/{parquet_file}"],
        },
    },
)

# Linear ordering of the DAG's tasks.
download_dataset_task >> format_to_parquet_task >> local_to_gcs_task >> bigquery_external_table_task