def bqimport(ts, dag, tablename, schedule, **kwargs):
    """Load pipe-delimited CSV exports from GCS into a BigQuery table.

    Builds and immediately executes a GoogleCloudStorageToBigQueryOperator
    that appends ``gs://prod-data-sync-bucket/mergeload/data/<schedule>/<ts>/
    <schema>_<table>_<conn>/*.csv`` into ``<schema>.tbl_<table>``, using a
    JSON schema file stored under ``json_schema/`` in the same bucket.

    :param ts: execution timestamp string; used as a path component of the
        GCS source objects.
    :param dag: the DAG the throwaway operator is attached to.
    :param tablename: dotted identifier; parts [1] and [2] are taken as the
        BigQuery dataset (schema) and table name respectively.
    :param schedule: sync-interval key; selects the connection entry from
        ``config`` and is used as a path component.
    :param kwargs: ignored (Airflow callable context passthrough).
    """
    # config[schedule][tablename] is a project-level mapping; index 1 holds
    # the Postgres connection id embedded in the export path.
    pg_conn = config[schedule][tablename][1]
    parts = tablename.split('.')
    table_schema = parts[1]
    table_name = parts[2]

    bqload = GoogleCloudStorageToBigQueryOperator(
        task_id=f'func_bqload_{table_name}',
        bucket='prod-data-sync-bucket',
        destination_project_dataset_table=f'{table_schema}.tbl_{table_name}',
        create_disposition='CREATE_IF_NEEDED',
        source_format='csv',
        field_delimiter='|',
        autodetect=False,
        schema_object=f'json_schema/{table_schema}_{table_name}.json',
        source_objects=[
            f'mergeload/data/{schedule}/{ts}/'
            f'{table_schema}_{table_name}_{pg_conn}/*.csv'
        ],
        quote_character='"',
        allow_quoted_newlines=True,
        allow_jagged_rows=True,
        write_disposition='WRITE_APPEND',
        bigquery_conn_id='bigquery_default',
        google_cloud_storage_conn_id='google_cloud_storage_default',
        skip_leading_rows=1,
        dag=dag)
    # Run the load synchronously inside this callable rather than scheduling
    # the operator as its own task.
    bqload.execute(None)
def move_from_gcs_to_bq(**context):
    """Append every staged GCS object into the raw BigQuery table.

    For each object name returned by ``get_bucket_file_names``, runs a
    one-off GoogleCloudStorageToBigQueryOperator load into
    ``<BQ_PROJECT>.raw.raw`` and, once the load succeeds, deletes the
    source object from the bucket.

    :param context: Airflow task context, forwarded to ``execute``.
    """
    gcs_hook = GoogleCloudStorageHook()
    destination_table = '{}.raw.raw'.format(env.BQ_PROJECT)

    for object_name in get_bucket_file_names(gcs_hook):
        loader = GoogleCloudStorageToBigQueryOperator(
            task_id='temp',
            bigquery_conn_id='google_cloud_default',
            bucket=env.GCS_BUCKET,
            source_objects=[object_name],
            destination_project_dataset_table=destination_table,
            write_disposition='WRITE_APPEND',
            schema_fields=raw_schema,
            skip_leading_rows=1)
        loader.execute(context)
        # Only reached when the load above did not raise: the staged file
        # has been ingested and can be removed.
        gcs_hook.delete(bucket=env.GCS_BUCKET, object=object_name)
def test_execute_explicit_project(self, bq_hook):
    """An explicit project in the destination table must survive into the
    MAX(id) query issued through the BigQuery cursor."""
    operator = GoogleCloudStorageToBigQueryOperator(
        task_id=TASK_ID,
        bucket=TEST_BUCKET,
        source_objects=TEST_SOURCE_OBJECTS,
        destination_project_dataset_table=TEST_EXPLICIT_DEST,
        max_id_key=MAX_ID_KEY)

    mocked_cursor = bq_hook.return_value.get_conn.return_value.cursor.return_value
    # Non-legacy SQL so the fully-qualified table id is backtick-quoted.
    mocked_cursor.use_legacy_sql = False

    operator.execute(None)

    mocked_cursor.execute.assert_called_once_with(
        "SELECT MAX(id) FROM `test-project.dataset.table`")