def main():
    """Explore the NOAA GOES-16 public BigQuery dataset.

    Lists the dataset's tables, shows the schema and a sample of the
    ABI L1b radiance table, then prints the estimated scan cost (in GB)
    of a bounding-box query — without actually running it.
    """
    helper = BigQueryHelper(active_project="bigquery-public-data",
                            dataset_name="noaa_goes16")

    # Quick look at what the dataset contains.
    helper.list_tables()
    helper.table_schema('abi_l1b_radiance')
    print(helper.head("abi_l1b_radiance", num_rows=10))

    # Scenes restricted to a geographic bounding box; estimate only.
    bbox_query = """ SELECT dataset_name, platform_id, scene_id FROM `bigquery-public-data.noaa_goes16.abi_l1b_radiance` WHERE geospatial_westbound_longitude<120 and geospatial_eastbound_longitude>75 and geospatial_northbound_latitude<50 and geospatial_southbound_latitude>30 """
    print("Query size in GB is %f " % helper.estimate_query_size(bbox_query))
class TestBQHelper(unittest.TestCase):
    """Integration tests for BigQueryHelper against public BigQuery datasets.

    NOTE(review): these tests hit live BigQuery public datasets, so they
    need network access and valid application credentials to run.
    """

    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """ SELECT value FROM `bigquery-public-data.openaq.global_air_quality` WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')), 11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query), float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query), DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query), DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        # A tiny scan limit (10**-10 GB) must make the safe runner bail out.
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_usage_tracker(self):
        # Fix: method name typo ("useage" -> "usage"). Still discovered by
        # unittest because the test_ prefix is kept; nothing calls it by name.
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        # github_repos.commits has nested (RECORD) fields; 33 counts the
        # flattened leaves.
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
import pandas as pd
import bq_helper
from bq_helper import BigQueryHelper

# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
# Two helper objects are created for the same dataset; only `bq_assistant`
# is used below.
medicare = bq_helper.BigQueryHelper(active_project="bigquery-public-data", dataset_name="cms_medicare")
bq_assistant = BigQueryHelper("bigquery-public-data", "cms_medicare")

# Enumerate the tables, then preview the 2015 inpatient charges table.
bq_assistant.list_tables()
bq_assistant.head("inpatient_charges_2015", num_rows=15)
# Exploration of the Stack Overflow public BigQuery dataset: tag answer
# rates by year, tag popularity, and (truncated below) answers-within-1h
# by day of week.
# NOTE(review): this chunk ends mid-statement — the `query2` triple-quoted
# string is never closed here, so the remainder of this script presumably
# continues beyond the visible source.
import bq_helper
from bq_helper import BigQueryHelper

stackOverflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data", dataset_name="stackoverflow")
bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow")
# `tabelas` / `esquemas_tabelas` are Portuguese for "tables" / "table schemas";
# `esquemas_tabelas` is initialized but not filled in the visible code.
tabelas = bq_assistant.list_tables()
esquemas_tabelas = {}

# Top-20 tags by question volume for Year == 2020 (HAVING Year > 2019 AND
# Year < 2021), re-ordered by the share of questions that got any answer.
query = """ SELECT Year, Tag, Total, Percent_Questions_with_Answers FROM (SELECT EXTRACT(YEAR FROM a.creation_date) as Year, t.tag_name as Tag, COUNT(1) as Total, ROUND(100 * SUM(IF(a.answer_count > 0, 1, 0)) / COUNT(*), 1) AS Percent_Questions_with_Answers FROM `bigquery-public-data.stackoverflow.posts_questions` a right JOIN `bigquery-public-data.stackoverflow.tags` t ON t.tag_name in UNNEST(SPLIT(a.tags,'|')) GROUP BY Year, Tag HAVING Year > 2019 AND Year < 2021 ORDER BY Total DESC LIMIT 20) ORDER BY Percent_Questions_with_Answers DESC """
response = stackOverflow.query_to_pandas_safe(query, max_gb_scanned=20)
response.head(20)

# Tags ranked by their stored usage count.
query1 = "SELECT tag_name as Assunto, count as Num_perguntas FROM `bigquery-public-data.stackoverflow.tags` order BY count DESC"
response1 = stackOverflow.query_to_pandas_safe(query1, max_gb_scanned=20)
response1.head(20)

# Truncated: the rest of this SELECT is outside the visible chunk.
query2 = """SELECT Day_of_Week, COUNT(1) AS Num_Questions, SUM(answered_in_1h) AS Num_Answered_in_1H,
from bq_helper import BigQueryHelper
import os
import time

# NOTE(review): `GoogleCredentials` is referenced but never imported in this
# chunk (it usually comes from `oauth2client.client`) — confirm the import
# exists elsewhere, otherwise this line raises NameError at runtime.
credentials = GoogleCredentials.get_application_default()
os.environ[
    "GOOGLE_APPLICATION_CREDENTIALS"] = 'C:\\Users\\lukek\\Desktop\\SQL\\lschlab2weather-495aae5d3687.json'
os.chdir(r'C:\Users\lukek\Desktop\SQL\huge datasets')
GOOGLE_APPLICATION_CREDENTIALS = 'C:\\Users\\lukek\\Desktop\\SQL\\lschlab2weather-495aae5d3687.json'

# create a helper object for our bigquery dataset
# Fix: the original called `bq_helper.BigQueryHelper(...)` although only the
# class (not the `bq_helper` module) was imported, which raises NameError.
bqh = BigQueryHelper(active_project="bigquery-public-data", dataset_name="noaa_gsod")

# build and run a series of queries to get annual temperatures for the US
# WARNING: each year takes 5+ mins to run and the resultant dataset is about 100MB!
weather = BigQueryHelper("bigquery-public-data", "noaa_gsod")
weather.list_tables()

start = time.time()
START_YEAR = 2017
# NOTE(review): range() excludes its stop value, so END_YEAR=2020 itself is
# never downloaded (2017-2019 only) — confirm that is intentional.
END_YEAR = 2020
for year in range(START_YEAR, END_YEAR):
    # One gsod table per year; cap each scan at 5 GB via the safe runner.
    query = "SELECT stn,year,mo,da,temp,dewp,slp,stp,visib,wdsp,mxpsd,gust,max,min,prcp,sndp,fog,rain_drizzle,snow_ice_pellets,hail,thunder,tornado_funnel_cloud FROM `bigquery-public-data.noaa_gsod.gsod{}`".format(
        year)
    df_wthr = bqh.query_to_pandas_safe(query, max_gb_scanned=5)
    filename = 'US_weather_{}.csv'.format(year)
    df_wthr.to_csv(filename, index=False)
    print("Saved {}".format(filename))
print('It took', time.time() - start, 'seconds.')