Example #1
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator


def bqimport(ts, dag, tablename, schedule, **kwargs):
    # tablename is expected in dotted form '<source>.<schema>.<table>'; the
    # module-level `config` mapping (defined elsewhere) supplies the Postgres
    # connection name for each schedule/table pair.
    pg_conn = config[schedule][tablename][1]
    table_schema = tablename.split('.')[1]
    table_name = tablename.split('.')[2]
    export_datetime = ts
    sync_interval = schedule
    # Append the exported CSV files for this table and execution timestamp from
    # GCS into BigQuery table <schema>.tbl_<table>, creating it if necessary.
    bqload = GoogleCloudStorageToBigQueryOperator(
        task_id='func_bqload_{}'.format(table_name),
        bucket='prod-data-sync-bucket',
        destination_project_dataset_table=table_schema + '.tbl_' + table_name,
        create_disposition='CREATE_IF_NEEDED',
        source_format='CSV',
        field_delimiter='|',
        autodetect=False,
        schema_object='json_schema/' + table_schema + '_' + table_name +
        '.json',
        source_objects=[
            'mergeload/data/' + sync_interval + '/' + ts + '/' + table_schema +
            '_' + table_name + '_' + pg_conn + '/*.csv'
        ],
        quote_character='"',
        allow_quoted_newlines=True,
        allow_jagged_rows=True,
        write_disposition='WRITE_APPEND',
        bigquery_conn_id='bigquery_default',
        google_cloud_storage_conn_id='google_cloud_storage_default',
        skip_leading_rows=1,
        dag=dag)
    # Run the load immediately inside this callable rather than as a separate task.
    bqload.execute(None)
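
A callable like this is normally attached to a DAG through a PythonOperator, so that the templated ts value and the dag object arrive via the task context. The sketch below is a minimal wiring example, not part of the original snippet: the DAG id, the table key 'warehouse.public.orders' and the schedule key 'hourly' are placeholders standing in for the real entries of the `config` mapping used above.

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('pg_to_bq_sync',                # hypothetical DAG id
          start_date=datetime(2019, 1, 1),
          schedule_interval='@hourly')

bqimport_orders = PythonOperator(
    task_id='bqimport_orders',
    python_callable=bqimport,
    provide_context=True,                 # passes ts, dag, etc. as keyword arguments
    op_kwargs={'tablename': 'warehouse.public.orders',  # placeholder table key
               'schedule': 'hourly'},                   # placeholder schedule key
    dag=dag)

With provide_context=True, the context keys ts and dag satisfy the first two parameters of bqimport, and the remaining context entries are absorbed by **kwargs.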
Example #2
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator


def move_from_gcs_to_bq(**context):
    # env, raw_schema and get_bucket_file_names are module-level settings and
    # helpers defined elsewhere in the project.
    gcs_hook = GoogleCloudStorageHook()
    file_list = get_bucket_file_names(gcs_hook)
    destination_table = '{}.raw.raw'.format(env.BQ_PROJECT)

    for file in file_list:
        # Append each CSV object to the raw table, then remove it from the bucket.
        gcs_to_bq_operator = GoogleCloudStorageToBigQueryOperator(
            task_id='temp',
            bigquery_conn_id='google_cloud_default',
            bucket=env.GCS_BUCKET,
            source_objects=[file],
            destination_project_dataset_table=destination_table,
            write_disposition='WRITE_APPEND',
            schema_fields=raw_schema,
            skip_leading_rows=1)
        gcs_to_bq_operator.execute(context)
        gcs_hook.delete(bucket=env.GCS_BUCKET, object=file)
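
get_bucket_file_names is not shown in the snippet. A plausible sketch, assuming it simply lists the objects to load through the hook's list() method and that the files sit under a hypothetical incoming/ prefix:

def get_bucket_file_names(gcs_hook):
    # GoogleCloudStorageHook.list returns the object names in the bucket;
    # the 'incoming/' prefix is an assumption made for this sketch.
    return gcs_hook.list(bucket=env.GCS_BUCKET, prefix='incoming/')

The callable itself would typically be run from a PythonOperator with provide_context=True, so that the context dict passed to execute(context) is populated.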
Example #3
    def test_execute_explicit_project(self, bq_hook):
        operator = GoogleCloudStorageToBigQueryOperator(
            task_id=TASK_ID,
            bucket=TEST_BUCKET,
            source_objects=TEST_SOURCE_OBJECTS,
            destination_project_dataset_table=TEST_EXPLICIT_DEST,
            max_id_key=MAX_ID_KEY)

        # Simulate a cursor configured for standard (non-legacy) SQL.
        bq_hook.return_value.get_conn.return_value.cursor.return_value.use_legacy_sql = False

        operator.execute(None)

        # With max_id_key set, the operator must query the maximum id from the
        # destination table after the load, using standard-SQL backtick quoting.
        bq_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .execute \
            .assert_called_once_with("SELECT MAX(id) FROM `test-project.dataset.table`")
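
This excerpt omits the module-level constants and the decorator that injects bq_hook. The reconstruction below is only a sketch: TEST_EXPLICIT_DEST and MAX_ID_KEY follow from the asserted SQL, while TASK_ID, TEST_BUCKET and TEST_SOURCE_OBJECTS are placeholders, and the patch target shown in the comment is the usual pattern of patching BigQueryHook in the module where the operator imports it.

from unittest import mock

# bq_hook is injected by a decorator along the lines of
# @mock.patch('airflow.contrib.operators.gcs_to_bq.BigQueryHook') on the test method.
TASK_ID = 'test-gcs-to-bq'                          # placeholder
TEST_BUCKET = 'test-bucket'                         # placeholder
TEST_SOURCE_OBJECTS = ['test/objects/*.csv']        # placeholder
TEST_EXPLICIT_DEST = 'test-project.dataset.table'   # implied by the asserted query
MAX_ID_KEY = 'id'                                   # implied by SELECT MAX(id)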